From 7db1953ec0abf973a4e28cbc2ffdf9f6b66fa7e4 Mon Sep 17 00:00:00 2001 From: Patryk Strusiewicz-Surmacki Date: Mon, 25 Mar 2024 18:26:48 +0100 Subject: [PATCH] Added support for gradual rollout Signed-off-by: Patryk Strusiewicz-Surmacki --- Dockerfile | 19 +- Makefile | 49 +- agent.Dockerfile | 37 ++ api/v1alpha1/networkconfigrevision_types.go | 102 ++++ api/v1alpha1/nodenetworkconfig_types.go | 80 +++ api/v1alpha1/zz_generated.deepcopy.go | 199 ++++++++ cmd/{manager => agent}/main.go | 80 ++- cmd/operator/main.go | 185 +++++++ .../manager.yaml => agent/agent.yaml} | 16 +- .../agent_master.yaml} | 8 +- config/{manager => agent}/config.yaml | 0 .../controller_agent_config.yaml} | 0 config/{manager => agent}/kustomization.yaml | 12 +- config/{manager => agent}/namespace.yaml | 0 config/{manager => agent}/service.yaml | 0 config/certmanager/kustomization.yaml | 3 + ...iff.telekom.de_networkconfigrevisions.yaml | 320 ++++++++++++ ....schiff.telekom.de_nodenetworkconfigs.yaml | 283 +++++++++++ config/crd/kustomization.yaml | 2 + config/default/kustomization.yaml | 3 +- config/default/manager_auth_proxy_patch.yaml | 2 +- config/default/manager_config_patch.yaml | 16 +- .../default/manager_master_config_patch.yaml | 14 +- .../default/manager_master_metrics_patch.yaml | 2 +- .../default/manager_master_webhook_patch.yaml | 2 +- config/default/manager_metrics_patch.yaml | 4 +- config/default/manager_webhook_patch.yaml | 4 +- config/operator/kustomization.yaml | 12 + config/operator/operator.yaml | 65 +++ config/rbac/role.yaml | 52 ++ ...ion_controller.go => config_controller.go} | 44 +- controllers/nodenetworkconfig_controller.go | 85 ++++ ...e_controller.go => revision_controller.go} | 34 +- .../vrfrouteconfiguration_controller.go | 63 --- frr-exporter.Dockerfile | 2 +- go.mod | 3 +- pkg/frr/dbus/dbus.go | 2 +- .../dbus/mock/{mock_frr.go => mock_dbus.go} | 0 pkg/frr/manager.go | 15 + pkg/frr/mock/mock_frr.go | 121 +++++ pkg/healthcheck/healthcheck.go | 54 +- 
pkg/healthcheck/healthcheck_test.go | 50 +- pkg/managerconfig/managerconfig_test.go | 8 +- pkg/reconciler/config_reconciler.go | 192 ++++++++ pkg/reconciler/configrevision_reconciler.go | 466 ++++++++++++++++++ pkg/reconciler/layer2.go | 71 +-- pkg/reconciler/layer3.go | 46 +- .../nodenetworkconfig_reconciler.go | 274 ++++++++++ pkg/reconciler/reconciler.go | 132 ----- pkg/reconciler/reconciler_test.go | 392 +++++++++++++++ 50 files changed, 3158 insertions(+), 467 deletions(-) create mode 100644 agent.Dockerfile create mode 100644 api/v1alpha1/networkconfigrevision_types.go create mode 100644 api/v1alpha1/nodenetworkconfig_types.go rename cmd/{manager => agent}/main.go (86%) create mode 100644 cmd/operator/main.go rename config/{manager/manager.yaml => agent/agent.yaml} (92%) rename config/{manager/manager_master.yaml => agent/agent_master.yaml} (96%) rename config/{manager => agent}/config.yaml (100%) rename config/{manager/controller_manager_config.yaml => agent/controller_agent_config.yaml} (100%) rename config/{manager => agent}/kustomization.yaml (66%) rename config/{manager => agent}/namespace.yaml (100%) rename config/{manager => agent}/service.yaml (100%) create mode 100644 config/crd/bases/network.schiff.telekom.de_networkconfigrevisions.yaml create mode 100644 config/crd/bases/network.schiff.telekom.de_nodenetworkconfigs.yaml create mode 100644 config/operator/kustomization.yaml create mode 100644 config/operator/operator.yaml rename controllers/{layer2networkconfiguration_controller.go => config_controller.go} (58%) create mode 100644 controllers/nodenetworkconfig_controller.go rename controllers/{routingtable_controller.go => revision_controller.go} (55%) delete mode 100644 controllers/vrfrouteconfiguration_controller.go rename pkg/frr/dbus/mock/{mock_frr.go => mock_dbus.go} (100%) create mode 100644 pkg/frr/mock/mock_frr.go create mode 100644 pkg/reconciler/config_reconciler.go create mode 100644 pkg/reconciler/configrevision_reconciler.go create mode 
100644 pkg/reconciler/nodenetworkconfig_reconciler.go delete mode 100644 pkg/reconciler/reconciler.go create mode 100644 pkg/reconciler/reconciler_test.go diff --git a/Dockerfile b/Dockerfile index a02d970d..f372dd0b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Build the manager binary -FROM docker.io/library/golang:1.21-alpine as builder +FROM docker.io/library/golang:1.21-alpine AS builder WORKDIR /workspace @@ -10,28 +10,19 @@ COPY go.sum go.sum # and so that source changes don't invalidate our downloaded layer RUN go mod download -# Build router -RUN apk add llvm clang linux-headers libbpf-dev musl-dev - # Copy the go source -COPY cmd/manager/main.go main.go +COPY cmd/operator/main.go main.go COPY api/ api/ COPY controllers/ controllers/ COPY pkg/ pkg/ -# Build router -COPY bpf/ bpf/ -RUN cd pkg/bpf/ && go generate - # Build -RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o manager main.go +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o operator main.go FROM alpine:latest -RUN apk add --no-cache iptables ip6tables - WORKDIR / -COPY --from=builder /workspace/manager . +COPY --from=builder /workspace/operator . USER 65532:65532 -ENTRYPOINT ["/manager"] +ENTRYPOINT ["/operator"] diff --git a/Makefile b/Makefile index 623026d8..394eed32 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,10 @@ -# Image URL to use all building/pushing image targets -IMG ?= ghcr.io/telekom/das-schiff-network-operator:latest +# Agent image URL to use all building/pushing image targets +AGENT_IMG ?= ghcr.io/telekom/das-schiff-network-operator-agent:latest # Sidecar image URL to use all building/pushing image targets SIDECAR_IMG ?= ghcr.io/telekom/frr-exporter:latest +# Operator image URL to use all building/pushing image targets +OPERATOR_IMG ?= ghcr.io/telekom/das-schiff-network-operator:latest # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. 
ENVTEST_K8S_VERSION = 1.25 @@ -68,8 +70,18 @@ test: manifests generate fmt vet envtest ## Run tests. ##@ Build .PHONY: build -build: generate fmt vet ## Build manager binary. - go build -o bin/manager cmd/manager/main.go +build: generate fmt vet ## Build agent binary. + go build -o bin/operator cmd/operator/main.go + go build -o bin/agent cmd/agent/main.go + go build -o bin/frr-exporter cmd/frr-exporter/main.go + +.PHONY: operator-build +operator-build: generate fmt vet ## Build agent binary. + go build -o bin/operator cmd/operator/main.go + +.PHONY: agent-build +agent-build: generate fmt vet ## Build agent binary. + go build -o bin/agent cmd/agent/main.go .PHONY: sidecar-build sidecar-build: build @@ -77,24 +89,40 @@ sidecar-build: build .PHONY: run run: manifests generate fmt vet ## Run a controller from your host. - go run ./cmd/manager/main.go + go run ./cmd/agent/main.go .PHONY: docker-build docker-build: test ## Build docker image with the manager. - docker build -t ${IMG} . + docker build -t ${OPERATOR_IMG} . + docker build -t ${AGENT_IMG} -f agent.Dockerfile . + docker build -t ${SIDECAR_IMG} -f frr-exporter.Dockerfile . + +.PHONY: docker-build-agent +docker-build-agent: test ## Build docker image with the manager. + docker build -t ${AGENT_IMG} -f agent.Dockerfile . .PHONY: docker-build-sidecar docker-build-sidecar: test ## Build docker image with the manager. docker build -t ${SIDECAR_IMG} -f frr-exporter.Dockerfile . +.PHONY: docker-build-operator +docker-build-operator: test ## Build docker image with the manager. + docker build -t ${OPERATOR_IMG} . + .PHONY: docker-push -docker-push: ## Push docker image with the manager. - docker push ${IMG} +docker-push: docker-push-agent docker-push-sidecar docker-push-operator + +.PHONY: docker-push-agent +docker-push-agent: ## Push docker image with the manager. + docker push ${AGENT_IMG} .PHONY: docker-push-sidecar docker-push-sidecar: ## Push docker image with the manager. 
docker push ${SIDECAR_IMG} +.PHONY: docker-push-operator +docker-push-operator: ## Push docker image with the manager. + docker push ${OPERATOR_IMG} ##@ Release @@ -133,8 +161,9 @@ uninstall-certs: manifests kustomize ## Uninstall certs .PHONY: deploy deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. - cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} - cd config/manager && $(KUSTOMIZE) edit set image frr-exporter=${SIDECAR_IMG} + cd config/agent && $(KUSTOMIZE) edit set image agent=${AGENT_IMG} + cd config/agent && $(KUSTOMIZE) edit set image frr-exporter=${SIDECAR_IMG} + cd config/operator && $(KUSTOMIZE) edit set image operator=${OPERATOR_IMG} $(KUSTOMIZE) build config/default | kubectl apply -f - .PHONY: undeploy diff --git a/agent.Dockerfile b/agent.Dockerfile new file mode 100644 index 00000000..54774b12 --- /dev/null +++ b/agent.Dockerfile @@ -0,0 +1,37 @@ +# Build the manager binary +FROM docker.io/library/golang:1.21-alpine AS builder + + +WORKDIR /workspace +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum +# cache deps before building and copying source so that we don't need to re-download as much +# and so that source changes don't invalidate our downloaded layer +RUN go mod download + +# Build router +RUN apk add llvm clang linux-headers libbpf-dev musl-dev + +# Copy the go source +COPY cmd/agent/main.go main.go +COPY api/ api/ +COPY controllers/ controllers/ +COPY pkg/ pkg/ + +# Build router +COPY bpf/ bpf/ +RUN cd pkg/bpf/ && go generate + +# Build +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o agent main.go + +FROM alpine:latest + +RUN apk add --no-cache iptables ip6tables + +WORKDIR / +COPY --from=builder /workspace/agent . 
+USER 65532:65532 + +ENTRYPOINT ["/agent"] diff --git a/api/v1alpha1/networkconfigrevision_types.go b/api/v1alpha1/networkconfigrevision_types.go new file mode 100644 index 00000000..905db700 --- /dev/null +++ b/api/v1alpha1/networkconfigrevision_types.go @@ -0,0 +1,102 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// NetworkConfigSpec defines the desired state of NetworkConfig. +type NetworkConfigRevisionSpec struct { + // Config stores global configuration of the nodes. + Config NodeNetworkConfigSpec `json:"config"` + // Revision is a hash of the NetworkConfigRevision object that is used to identify the particular revision. + Revision string `json:"revision"` +} + +type NetworkConfigRevisionStatus struct { + // IsInvalid determines if NetworkConfigRevision results in misconfigured nodes (invalid configuration). + IsInvalid bool `json:"isInvalid"` + // Ready informs about how many nodes were already provisioned with a config derived from the revision. + Ready int `json:"ready"` + // Ongoing informs about how many nodes are currently provisioned with a config derived from the revision. + Ongoing int `json:"ongoing"` + // Queued informs about how many nodes are currently waiting to be provisiined with a config derived from the revision. 
+ Queued int `json:"queued"` + // Total informs about how many nodes in total can be provisioned with a config derived from the revision. + Total int `json:"total"` +} + +//+kubebuilder:object:root=true +//+kubebuilder:subresource:status +//+kubebuilder:resource:shortName=ncr,scope=Cluster +//+kubebuilder:printcolumn:name="Invalid",type=string,JSONPath=`.status.isInvalid` +//+kubebuilder:printcolumn:name="Queued",type="integer",JSONPath=".status.queued" +//+kubebuilder:printcolumn:name="Ongoing",type="integer",JSONPath=".status.ongoing" +//+kubebuilder:printcolumn:name="Ready",type="integer",JSONPath=".status.ready" +//+kubebuilder:printcolumn:name="Total",type="integer",JSONPath=".status.total" +//+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" + +// NetworkConfigRevision is the Schema for the node configuration. +type NetworkConfigRevision struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec NetworkConfigRevisionSpec `json:"spec,omitempty"` + Status NetworkConfigRevisionStatus `json:"status,omitempty"` +} + +//+kubebuilder:object:root=true + +// NetworkConfigRevisionList contains a list of NetworkConfigRevision. 
+type NetworkConfigRevisionList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []NetworkConfigRevision `json:"items"` +} + +func NewRevision(config *NodeNetworkConfig) (*NetworkConfigRevision, error) { + data, err := json.Marshal(config.Spec) + if err != nil { + return nil, fmt.Errorf("error marshalling data: %w", err) + } + + h := sha256.New() + if _, err := h.Write(data); err != nil { + return nil, fmt.Errorf("failed hashing network config: %w", err) + } + hash := h.Sum(nil) + hashHex := hex.EncodeToString(hash) + + return &NetworkConfigRevision{ + ObjectMeta: metav1.ObjectMeta{Name: hashHex[:10]}, + Spec: NetworkConfigRevisionSpec{ + Config: config.Spec, + Revision: hashHex, + }, + Status: NetworkConfigRevisionStatus{}, + }, nil +} + +func init() { + SchemeBuilder.Register(&NetworkConfigRevision{}, &NetworkConfigRevisionList{}) +} diff --git a/api/v1alpha1/nodenetworkconfig_types.go b/api/v1alpha1/nodenetworkconfig_types.go new file mode 100644 index 00000000..2fe85bba --- /dev/null +++ b/api/v1alpha1/nodenetworkconfig_types.go @@ -0,0 +1,80 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// NodeNetworkConfigSpec defines the desired state of NodeConfig. +type NodeNetworkConfigSpec struct { + // Revision stores hash of the NodeConfigRevision that was used to create the NodeNetworkConfig object. 
+ Revision string `json:"revision"` + Layer2 []Layer2NetworkConfigurationSpec `json:"layer2"` + Vrf []VRFRouteConfigurationSpec `json:"vrf"` + RoutingTable []RoutingTableSpec `json:"routingTable"` +} + +// NodeNetworkConfigStatus defines the observed state of NodeConfig. +type NodeNetworkConfigStatus struct { + // ConfigStatus describes provisioning state of the NodeConfig. Can be either 'provisioning' or 'provisioned'. + ConfigStatus string `json:"configStatus"` + // LastUpdate determines when last update (change) of the ConfigStatus field took place. + LastUpdate metav1.Time `json:"lastUpdate"` +} + +//+kubebuilder:object:root=true +//+kubebuilder:subresource:status +//+kubebuilder:resource:shortName=nnc,scope=Cluster +//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.configStatus` +//+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" + +// NodeNetworkConfig is the Schema for the node configuration. +type NodeNetworkConfig struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec NodeNetworkConfigSpec `json:"spec,omitempty"` + Status NodeNetworkConfigStatus `json:"status,omitempty"` +} + +//+kubebuilder:object:root=true + +// NodeNetworkConfigList contains a list of NodeConfig. 
+type NodeNetworkConfigList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []NodeNetworkConfig `json:"items"` +} + +func NewEmptyConfig(name string) *NodeNetworkConfig { + return &NodeNetworkConfig{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: NodeNetworkConfigSpec{ + Vrf: []VRFRouteConfigurationSpec{}, + Layer2: []Layer2NetworkConfigurationSpec{}, + RoutingTable: []RoutingTableSpec{}, + }, + Status: NodeNetworkConfigStatus{ + ConfigStatus: "", + }, + } +} + +func init() { + SchemeBuilder.Register(&NodeNetworkConfig{}, &NodeNetworkConfigList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index ecaa9148..afb46ad8 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -129,6 +129,205 @@ func (in *Layer2NetworkConfigurationStatus) DeepCopy() *Layer2NetworkConfigurati return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NetworkConfigRevision) DeepCopyInto(out *NetworkConfigRevision) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + out.Status = in.Status +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NetworkConfigRevision. +func (in *NetworkConfigRevision) DeepCopy() *NetworkConfigRevision { + if in == nil { + return nil + } + out := new(NetworkConfigRevision) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *NetworkConfigRevision) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *NetworkConfigRevisionList) DeepCopyInto(out *NetworkConfigRevisionList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]NetworkConfigRevision, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NetworkConfigRevisionList. +func (in *NetworkConfigRevisionList) DeepCopy() *NetworkConfigRevisionList { + if in == nil { + return nil + } + out := new(NetworkConfigRevisionList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *NetworkConfigRevisionList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NetworkConfigRevisionSpec) DeepCopyInto(out *NetworkConfigRevisionSpec) { + *out = *in + in.Config.DeepCopyInto(&out.Config) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NetworkConfigRevisionSpec. +func (in *NetworkConfigRevisionSpec) DeepCopy() *NetworkConfigRevisionSpec { + if in == nil { + return nil + } + out := new(NetworkConfigRevisionSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NetworkConfigRevisionStatus) DeepCopyInto(out *NetworkConfigRevisionStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NetworkConfigRevisionStatus. 
+func (in *NetworkConfigRevisionStatus) DeepCopy() *NetworkConfigRevisionStatus { + if in == nil { + return nil + } + out := new(NetworkConfigRevisionStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeNetworkConfig) DeepCopyInto(out *NodeNetworkConfig) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeNetworkConfig. +func (in *NodeNetworkConfig) DeepCopy() *NodeNetworkConfig { + if in == nil { + return nil + } + out := new(NodeNetworkConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *NodeNetworkConfig) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeNetworkConfigList) DeepCopyInto(out *NodeNetworkConfigList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]NodeNetworkConfig, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeNetworkConfigList. +func (in *NodeNetworkConfigList) DeepCopy() *NodeNetworkConfigList { + if in == nil { + return nil + } + out := new(NodeNetworkConfigList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *NodeNetworkConfigList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeNetworkConfigSpec) DeepCopyInto(out *NodeNetworkConfigSpec) { + *out = *in + if in.Layer2 != nil { + in, out := &in.Layer2, &out.Layer2 + *out = make([]Layer2NetworkConfigurationSpec, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Vrf != nil { + in, out := &in.Vrf, &out.Vrf + *out = make([]VRFRouteConfigurationSpec, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.RoutingTable != nil { + in, out := &in.RoutingTable, &out.RoutingTable + *out = make([]RoutingTableSpec, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeNetworkConfigSpec. +func (in *NodeNetworkConfigSpec) DeepCopy() *NodeNetworkConfigSpec { + if in == nil { + return nil + } + out := new(NodeNetworkConfigSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeNetworkConfigStatus) DeepCopyInto(out *NodeNetworkConfigStatus) { + *out = *in + in.LastUpdate.DeepCopyInto(&out.LastUpdate) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeNetworkConfigStatus. +func (in *NodeNetworkConfigStatus) DeepCopy() *NodeNetworkConfigStatus { + if in == nil { + return nil + } + out := new(NodeNetworkConfigStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *RoutingTable) DeepCopyInto(out *RoutingTable) { *out = *in diff --git a/cmd/manager/main.go b/cmd/agent/main.go similarity index 86% rename from cmd/manager/main.go rename to cmd/agent/main.go index 536390ab..f7b738a2 100644 --- a/cmd/manager/main.go +++ b/cmd/agent/main.go @@ -34,6 +34,7 @@ import ( "github.com/telekom/das-schiff-network-operator/pkg/anycast" "github.com/telekom/das-schiff-network-operator/pkg/bpf" "github.com/telekom/das-schiff-network-operator/pkg/config" + "github.com/telekom/das-schiff-network-operator/pkg/frr" "github.com/telekom/das-schiff-network-operator/pkg/healthcheck" "github.com/telekom/das-schiff-network-operator/pkg/macvlan" "github.com/telekom/das-schiff-network-operator/pkg/managerconfig" @@ -92,6 +93,7 @@ func main() { var onlyBPFMode bool var configFile string var interfacePrefix string + var nodeConfigPath string flag.StringVar(&configFile, "config", "", "The controller will load its initial configuration from this file. "+ "Omit this flag to use the default configuration values. "+ @@ -100,6 +102,8 @@ func main() { "Only attach BPF to specified interfaces in config. This will not start any reconciliation. 
Perfect for masters.") flag.StringVar(&interfacePrefix, "macvlan-interface-prefix", "", "Interface prefix for bridge devices for MACVlan sync") + flag.StringVar(&nodeConfigPath, "nodeconfig-path", reconciler.DefaultNodeConfigPath, + "Path to store working node configuration.") opts := zap.Options{ Development: true, } @@ -107,27 +111,14 @@ func main() { flag.Parse() ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) - var err error - var options manager.Options - if configFile != "" { - options, err = managerconfig.Load(configFile, scheme) - if err != nil { - setupLog.Error(err, "unable to load the config file") - os.Exit(1) - } - } else { - options = ctrl.Options{Scheme: scheme} - } - if options.MetricsBindAddress != "0" && options.MetricsBindAddress != "" { - err = initCollectors() - if err != nil { - setupLog.Error(err, "unable to initialize metrics collectors") - os.Exit(1) - } + options, err := setManagerOptions(configFile) + if err != nil { + setupLog.Error(err, "unable to configure manager's options") + os.Exit(1) } clientConfig := ctrl.GetConfigOrDie() - mgr, err := ctrl.NewManager(clientConfig, options) + mgr, err := ctrl.NewManager(clientConfig, *options) if err != nil { setupLog.Error(err, "unable to start manager") os.Exit(1) @@ -146,7 +137,7 @@ func main() { os.Exit(1) } - if err := initComponents(mgr, anycastTracker, cfg, clientConfig, onlyBPFMode); err != nil { + if err := initComponents(mgr, anycastTracker, cfg, clientConfig, onlyBPFMode, nodeConfigPath); err != nil { setupLog.Error(err, "unable to initialize components") os.Exit(1) } @@ -163,10 +154,32 @@ func main() { } } -func initComponents(mgr manager.Manager, anycastTracker *anycast.Tracker, cfg *config.Config, clientConfig *rest.Config, onlyBPFMode bool) error { +func setManagerOptions(configFile string) (*manager.Options, error) { + var err error + var options manager.Options + if configFile != "" { + options, err = managerconfig.Load(configFile, scheme) + if err != nil { + return nil, 
fmt.Errorf("unable to load the config file: %w", err) + } + } else { + options = ctrl.Options{Scheme: scheme} + } + + if options.MetricsBindAddress != "0" && options.MetricsBindAddress != "" { + err = initCollectors() + if err != nil { + return nil, fmt.Errorf("unable to initialize metrics collectors: %w", err) + } + } + + return &options, nil +} + +func initComponents(mgr manager.Manager, anycastTracker *anycast.Tracker, cfg *config.Config, clientConfig *rest.Config, onlyBPFMode bool, nodeConfigPath string) error { // Start VRFRouteConfigurationReconciler when we are not running in only BPF mode. if !onlyBPFMode { - if err := setupReconcilers(mgr, anycastTracker); err != nil { + if err := setupReconcilers(mgr, anycastTracker, nodeConfigPath); err != nil { return fmt.Errorf("unable to setup reconcilers: %w", err) } } @@ -225,34 +238,19 @@ func initComponents(mgr manager.Manager, anycastTracker *anycast.Tracker, cfg *c return nil } -func setupReconcilers(mgr manager.Manager, anycastTracker *anycast.Tracker) error { - r, err := reconciler.NewReconciler(mgr.GetClient(), anycastTracker, mgr.GetLogger()) +func setupReconcilers(mgr manager.Manager, anycastTracker *anycast.Tracker, nodeConfigPath string) error { + r, err := reconciler.NewNodeNetworkConfigReconciler(mgr.GetClient(), anycastTracker, mgr.GetLogger(), + nodeConfigPath, frr.NewFRRManager(), nl.NewManager(&nl.Toolkit{})) if err != nil { return fmt.Errorf("unable to create debounced reconciler: %w", err) } - if err = (&controllers.VRFRouteConfigurationReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Reconciler: r, - }).SetupWithManager(mgr); err != nil { - return fmt.Errorf("unable to create VRFRouteConfiguration controller: %w", err) - } - - if err = (&controllers.Layer2NetworkConfigurationReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Reconciler: r, - }).SetupWithManager(mgr); err != nil { - return fmt.Errorf("unable to create Layer2NetworkConfiguration controller: %w", 
err) - } - - if err = (&controllers.RoutingTableReconciler{ + if err = (&controllers.NodeNetworkConfigReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), Reconciler: r, }).SetupWithManager(mgr); err != nil { - return fmt.Errorf("unable to create RoutingTable controller: %w", err) + return fmt.Errorf("unable to create NodeConfig controller: %w", err) } return nil diff --git a/cmd/operator/main.go b/cmd/operator/main.go new file mode 100644 index 00000000..c535f2bd --- /dev/null +++ b/cmd/operator/main.go @@ -0,0 +1,185 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//nolint:gci +package main + +import ( + "context" + "flag" + "fmt" + "os" + "time" + + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + + networkv1alpha1 "github.com/telekom/das-schiff-network-operator/api/v1alpha1" + "github.com/telekom/das-schiff-network-operator/controllers" + "github.com/telekom/das-schiff-network-operator/pkg/managerconfig" + "github.com/telekom/das-schiff-network-operator/pkg/reconciler" + + // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) //nolint:gci + // to ensure that exec-entrypoint and run can make use of them. 
+ _ "k8s.io/client-go/plugin/pkg/client/auth" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" + //nolint:gci // kubebuilder import + //+kubebuilder:scaffold:imports +) + +var ( + scheme = runtime.NewScheme() + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + + utilruntime.Must(networkv1alpha1.AddToScheme(scheme)) + //+kubebuilder:scaffold:scheme +} + +func main() { + var configFile string + var timeout string + var maxUpdating int + flag.StringVar(&configFile, "config", "", + "The controller will load its initial configuration from this file. "+ + "Omit this flag to use the default configuration values. "+ + "Command-line flags override configuration from this file.") + flag.StringVar(&timeout, "timeout", reconciler.DefaultTimeout, + "Timeout for Kubernetes API connections (default: 60s).") + flag.IntVar(&maxUpdating, "max-updating", 1, "Configures how many nodes can be updated simultaneously when rolling update is performed.") + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + options, err := setMangerOptions(configFile) + if err != nil { + setupLog.Error(err, "error configuring manager options") + os.Exit(1) + } + + clientConfig := ctrl.GetConfigOrDie() + mgr, err := ctrl.NewManager(clientConfig, *options) + if err != nil { + setupLog.Error(err, "unable to start manager") + os.Exit(1) + } + + err = setupReconcilers(mgr, timeout, maxUpdating) + if err != nil { + setupLog.Error(err, "unable to setup reconcilers") + os.Exit(1) + } + + setupLog.Info("starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "problem running manager") + os.Exit(1) + } +} + +func setupReconcilers(mgr manager.Manager, timeout string, maxUpdating int) error { + timoutVal, err := 
time.ParseDuration(timeout)
+	if err != nil {
+		return fmt.Errorf("error parsing timeout value %s: %w", timeout, err)
+	}
+
+	cr, err := reconciler.NewConfigReconciler(mgr.GetClient(), mgr.GetLogger().WithName("ConfigReconciler"), timoutVal)
+	if err != nil {
+		return fmt.Errorf("unable to create config reconciler: %w", err)
+	}
+
+	ncr, err := reconciler.NewNodeConfigReconciler(mgr.GetClient(), mgr.GetLogger().WithName("NodeConfigReconciler"), timoutVal, mgr.GetScheme(), maxUpdating)
+	if err != nil {
+		return fmt.Errorf("unable to create node reconciler: %w", err)
+	}
+
+	initialSetup := newOnLeaderElectionEvent(cr)
+	if err := mgr.Add(initialSetup); err != nil {
+		return fmt.Errorf("error adding on leader election event to the manager: %w", err)
+	}
+
+	if err = (&controllers.ConfigReconciler{
+		Client:     mgr.GetClient(),
+		Scheme:     mgr.GetScheme(),
+		Reconciler: cr,
+	}).SetupWithManager(mgr); err != nil {
+		return fmt.Errorf("unable to create Config controller: %w", err)
+	}
+
+	if err = (&controllers.RevisionReconciler{
+		Client:     mgr.GetClient(),
+		Scheme:     mgr.GetScheme(),
+		Reconciler: ncr,
+	}).SetupWithManager(mgr); err != nil {
+		return fmt.Errorf("unable to create Revision controller: %w", err)
+	}
+
+	return nil
+}
+
+func setMangerOptions(configFile string) (*manager.Options, error) {
+	var err error
+	var options manager.Options
+	if configFile != "" {
+		options, err = managerconfig.Load(configFile, scheme)
+		if err != nil {
+			return nil, fmt.Errorf("unable to load the config file: %w", err)
+		}
+	} else {
+		options = ctrl.Options{Scheme: scheme}
+	}
+
+	// force leader election
+	options.LeaderElection = true
+	if options.LeaderElectionID == "" {
+		options.LeaderElectionID = "network-operator"
+	}
+
+	// force turn off metrics server
+	options.MetricsBindAddress = "0"
+
+	return &options, nil
+}
+
+type onLeaderElectionEvent struct {
+	cr *reconciler.ConfigReconciler
+}
+
+func newOnLeaderElectionEvent(cr *reconciler.ConfigReconciler)
*onLeaderElectionEvent { + return &onLeaderElectionEvent{ + cr: cr, + } +} + +func (*onLeaderElectionEvent) NeedLeaderElection() bool { + return true +} + +func (e *onLeaderElectionEvent) Start(ctx context.Context) error { + if err := e.cr.ReconcileDebounced(ctx); err != nil { + return fmt.Errorf("error configuring initial configuration revision: %w", err) + } + return nil +} diff --git a/config/manager/manager.yaml b/config/agent/agent.yaml similarity index 92% rename from config/manager/manager.yaml rename to config/agent/agent.yaml index 16497def..a33a2add 100644 --- a/config/manager/manager.yaml +++ b/config/agent/agent.yaml @@ -1,20 +1,20 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: worker + name: agent namespace: system labels: - app.kubernetes.io/component: worker + app.kubernetes.io/component: agent spec: selector: matchLabels: - app.kubernetes.io/component: worker + app.kubernetes.io/component: agent template: metadata: annotations: - kubectl.kubernetes.io/default-container: manager + kubectl.kubernetes.io/default-container: agent labels: - app.kubernetes.io/component: worker + app.kubernetes.io/component: agent spec: affinity: nodeAffinity: @@ -37,15 +37,15 @@ spec: hostPID: true containers: - command: - - /manager + - /agent args: [] env: - name: NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName - image: controller:latest - name: manager + image: agent:latest + name: agent securityContext: privileged: true runAsUser: 0 diff --git a/config/manager/manager_master.yaml b/config/agent/agent_master.yaml similarity index 96% rename from config/manager/manager_master.yaml rename to config/agent/agent_master.yaml index cd3a4917..65ea2e1d 100644 --- a/config/manager/manager_master.yaml +++ b/config/agent/agent_master.yaml @@ -12,7 +12,7 @@ spec: template: metadata: annotations: - kubectl.kubernetes.io/default-container: manager + kubectl.kubernetes.io/default-container: agent labels: app.kubernetes.io/component: master spec: @@ -40,7 +40,7 @@ 
spec: hostPID: true containers: - command: - - /manager + - /agent args: - -only-attach-bpf env: @@ -48,8 +48,8 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName - image: controller:latest - name: manager + image: agent:latest + name: agent securityContext: privileged: true runAsUser: 0 diff --git a/config/manager/config.yaml b/config/agent/config.yaml similarity index 100% rename from config/manager/config.yaml rename to config/agent/config.yaml diff --git a/config/manager/controller_manager_config.yaml b/config/agent/controller_agent_config.yaml similarity index 100% rename from config/manager/controller_manager_config.yaml rename to config/agent/controller_agent_config.yaml diff --git a/config/manager/kustomization.yaml b/config/agent/kustomization.yaml similarity index 66% rename from config/manager/kustomization.yaml rename to config/agent/kustomization.yaml index e30e1a4f..3e53816d 100644 --- a/config/manager/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -1,6 +1,6 @@ resources: -- manager.yaml -- manager_master.yaml +- agent.yaml +- agent_master.yaml - service.yaml # - namespace.yaml @@ -9,16 +9,16 @@ generatorOptions: configMapGenerator: - files: - - controller_manager_config.yaml - name: manager-config + - controller_agent_config.yaml + name: agent-config - files: - config.yaml name: config apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: -- name: controller - newName: ghcr.io/telekom/das-schiff-network-operator +- name: agent + newName: ghcr.io/telekom/das-schiff-network-operator-agent newTag: latest - name: frr-exporter newName: ghcr.io/telekom/frr-exporter diff --git a/config/manager/namespace.yaml b/config/agent/namespace.yaml similarity index 100% rename from config/manager/namespace.yaml rename to config/agent/namespace.yaml diff --git a/config/manager/service.yaml b/config/agent/service.yaml similarity index 100% rename from config/manager/service.yaml rename to config/agent/service.yaml diff --git 
a/config/certmanager/kustomization.yaml b/config/certmanager/kustomization.yaml index bebea5a5..ff414e3c 100644 --- a/config/certmanager/kustomization.yaml +++ b/config/certmanager/kustomization.yaml @@ -1,3 +1,6 @@ +# Adds namespace to all resources. +namespace: kube-system + resources: - certificate.yaml diff --git a/config/crd/bases/network.schiff.telekom.de_networkconfigrevisions.yaml b/config/crd/bases/network.schiff.telekom.de_networkconfigrevisions.yaml new file mode 100644 index 00000000..833203b0 --- /dev/null +++ b/config/crd/bases/network.schiff.telekom.de_networkconfigrevisions.yaml @@ -0,0 +1,320 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: networkconfigrevisions.network.schiff.telekom.de +spec: + group: network.schiff.telekom.de + names: + kind: NetworkConfigRevision + listKind: NetworkConfigRevisionList + plural: networkconfigrevisions + shortNames: + - ncr + singular: networkconfigrevision + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.isInvalid + name: Invalid + type: string + - jsonPath: .status.queued + name: Queued + type: integer + - jsonPath: .status.ongoing + name: Ongoing + type: integer + - jsonPath: .status.ready + name: Ready + type: integer + - jsonPath: .status.total + name: Total + type: integer + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: NetworkConfigRevision is the Schema for the node configuration. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NetworkConfigSpec defines the desired state of NetworkConfig. + properties: + config: + description: Config stores global configuration of the nodes. + properties: + layer2: + items: + description: Layer2NetworkConfigurationSpec defines the desired + state of Layer2NetworkConfiguration. + properties: + advertiseNeighbors: + description: If desired network-operator advertises host + routes for local neighbors + type: boolean + anycastGateways: + description: Anycast Gateway to configure on bridge + items: + type: string + type: array + anycastMac: + description: If anycast is desired, specify anycast gateway + MAC address + pattern: (?:[[:xdigit:]]{2}:){5}[[:xdigit:]]{2} + type: string + createMacVLANInterface: + description: Create MACVLAN attach interface + type: boolean + id: + description: VLAN Id of the layer 2 network + type: integer + mtu: + description: Network interface MTU + maximum: 9000 + minimum: 1000 + type: integer + neighSuppression: + description: Enable ARP / ND suppression + type: boolean + nodeSelector: + description: Select nodes to create Layer2 network on + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. 
+ type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + vni: + description: VXLAN VNI Id for the layer 2 network + maximum: 16777215 + minimum: 1 + type: integer + vrf: + description: VRF to attach Layer2 network to, default if + not set + type: string + required: + - id + - mtu + - vni + type: object + type: array + revision: + description: Revision stores hash of the NodeConfigRevision that + was used to create the NodeNetworkConfig object. + type: string + routingTable: + items: + description: RoutingTableSpec defines the desired state of RoutingTable. + properties: + tableId: + description: TableID is the host table that can be used + to export routes + type: integer + required: + - tableId + type: object + type: array + vrf: + items: + description: VRFRouteConfigurationSpec defines the desired state + of VRFRouteConfiguration. 
+ properties: + aggregate: + description: Aggregate Routes that should be announced + items: + type: string + type: array + community: + description: Community for export, if omitted no community + will be set + type: string + export: + description: Routes exported from the cluster VRF into the + specified VRF + items: + description: VRFRouteConfigurationPrefixItem defines a + prefix item. + properties: + action: + enum: + - permit + - deny + type: string + cidr: + description: CIDR of the leaked network + type: string + ge: + description: Minimum prefix length to be matched + type: integer + le: + description: Maximum prefix length to be matched + type: integer + seq: + description: Sequence in the generated prefix-list, + if omitted will be list index + maximum: 4294967295 + minimum: 1 + type: integer + required: + - action + type: object + maxItems: 4294967295 + type: array + import: + description: Routes imported from this VRF into the cluster + VRF + items: + description: VRFRouteConfigurationPrefixItem defines a + prefix item. 
+                                  properties:
+                                    action:
+                                      enum:
+                                      - permit
+                                      - deny
+                                      type: string
+                                    cidr:
+                                      description: CIDR of the leaked network
+                                      type: string
+                                    ge:
+                                      description: Minimum prefix length to be matched
+                                      type: integer
+                                    le:
+                                      description: Maximum prefix length to be matched
+                                      type: integer
+                                    seq:
+                                      description: Sequence in the generated prefix-list,
+                                        if omitted will be list index
+                                      maximum: 4294967295
+                                      minimum: 1
+                                      type: integer
+                                  required:
+                                  - action
+                                  type: object
+                                maxItems: 4294967295
+                                type: array
+                              mtu:
+                                default: 9000
+                                description: The MTU of the VRF
+                                type: integer
+                              seq:
+                                description: Sequence of the generated route-map, maximum
+                                  of 65534 because we sometimes have to set an explicit
+                                  default-deny
+                                maximum: 65534
+                                minimum: 1
+                                type: integer
+                              vrf:
+                                description: VRF this configuration refers to
+                                maxLength: 12
+                                type: string
+                              required:
+                              - export
+                              - import
+                              - seq
+                              type: object
+                            type: array
+                          required:
+                          - layer2
+                          - revision
+                          - routingTable
+                          - vrf
+                          type: object
+                        revision:
+                          description: Revision is a hash of the NetworkConfigRevision object
+                            that is used to identify the particular revision.
+                          type: string
+                      required:
+                      - config
+                      - revision
+                      type: object
+                    status:
+                      properties:
+                        isInvalid:
+                          description: IsInvalid determines if NetworkConfigRevision results
+                            in misconfigured nodes (invalid configuration).
+                          type: boolean
+                        ongoing:
+                          description: Ongoing informs about how many nodes are currently provisioned
+                            with a config derived from the revision.
+                          type: integer
+                        queued:
+                          description: Queued informs about how many nodes are currently waiting
+                            to be provisioned with a config derived from the revision.
+                          type: integer
+                        ready:
+                          description: Ready informs about how many nodes were already provisioned
+                            with a config derived from the revision.
+                          type: integer
+                        total:
+                          description: Total informs about how many nodes in total can be provisioned
+                            with a config derived from the revision.
+ type: integer + required: + - isInvalid + - ongoing + - queued + - ready + - total + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/network.schiff.telekom.de_nodenetworkconfigs.yaml b/config/crd/bases/network.schiff.telekom.de_nodenetworkconfigs.yaml new file mode 100644 index 00000000..4421b173 --- /dev/null +++ b/config/crd/bases/network.schiff.telekom.de_nodenetworkconfigs.yaml @@ -0,0 +1,283 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodenetworkconfigs.network.schiff.telekom.de +spec: + group: network.schiff.telekom.de + names: + kind: NodeNetworkConfig + listKind: NodeNetworkConfigList + plural: nodenetworkconfigs + shortNames: + - nnc + singular: nodenetworkconfig + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.configStatus + name: Status + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeNetworkConfig is the Schema for the node configuration. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NodeNetworkConfigSpec defines the desired state of NodeConfig. 
+ properties: + layer2: + items: + description: Layer2NetworkConfigurationSpec defines the desired + state of Layer2NetworkConfiguration. + properties: + advertiseNeighbors: + description: If desired network-operator advertises host routes + for local neighbors + type: boolean + anycastGateways: + description: Anycast Gateway to configure on bridge + items: + type: string + type: array + anycastMac: + description: If anycast is desired, specify anycast gateway + MAC address + pattern: (?:[[:xdigit:]]{2}:){5}[[:xdigit:]]{2} + type: string + createMacVLANInterface: + description: Create MACVLAN attach interface + type: boolean + id: + description: VLAN Id of the layer 2 network + type: integer + mtu: + description: Network interface MTU + maximum: 9000 + minimum: 1000 + type: integer + neighSuppression: + description: Enable ARP / ND suppression + type: boolean + nodeSelector: + description: Select nodes to create Layer2 network on + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + vni: + description: VXLAN VNI Id for the layer 2 network + maximum: 16777215 + minimum: 1 + type: integer + vrf: + description: VRF to attach Layer2 network to, default if not + set + type: string + required: + - id + - mtu + - vni + type: object + type: array + revision: + description: Revision stores hash of the NodeConfigRevision that was + used to create the NodeNetworkConfig object. + type: string + routingTable: + items: + description: RoutingTableSpec defines the desired state of RoutingTable. + properties: + tableId: + description: TableID is the host table that can be used to export + routes + type: integer + required: + - tableId + type: object + type: array + vrf: + items: + description: VRFRouteConfigurationSpec defines the desired state + of VRFRouteConfiguration. + properties: + aggregate: + description: Aggregate Routes that should be announced + items: + type: string + type: array + community: + description: Community for export, if omitted no community will + be set + type: string + export: + description: Routes exported from the cluster VRF into the specified + VRF + items: + description: VRFRouteConfigurationPrefixItem defines a prefix + item. 
+                properties:
+                  action:
+                    enum:
+                    - permit
+                    - deny
+                    type: string
+                  cidr:
+                    description: CIDR of the leaked network
+                    type: string
+                  ge:
+                    description: Minimum prefix length to be matched
+                    type: integer
+                  le:
+                    description: Maximum prefix length to be matched
+                    type: integer
+                  seq:
+                    description: Sequence in the generated prefix-list, if
+                      omitted will be list index
+                    maximum: 4294967295
+                    minimum: 1
+                    type: integer
+                required:
+                - action
+                type: object
+              maxItems: 4294967295
+              type: array
+            import:
+              description: Routes imported from this VRF into the cluster
+                VRF
+              items:
+                description: VRFRouteConfigurationPrefixItem defines a prefix
+                  item.
+                properties:
+                  action:
+                    enum:
+                    - permit
+                    - deny
+                    type: string
+                  cidr:
+                    description: CIDR of the leaked network
+                    type: string
+                  ge:
+                    description: Minimum prefix length to be matched
+                    type: integer
+                  le:
+                    description: Maximum prefix length to be matched
+                    type: integer
+                  seq:
+                    description: Sequence in the generated prefix-list, if
+                      omitted will be list index
+                    maximum: 4294967295
+                    minimum: 1
+                    type: integer
+                required:
+                - action
+                type: object
+              maxItems: 4294967295
+              type: array
+            mtu:
+              default: 9000
+              description: The MTU of the VRF
+              type: integer
+            seq:
+              description: Sequence of the generated route-map, maximum of
+                65534 because we sometimes have to set an explicit default-deny
+              maximum: 65534
+              minimum: 1
+              type: integer
+            vrf:
+              description: VRF this configuration refers to
+              maxLength: 12
+              type: string
+          required:
+          - export
+          - import
+          - seq
+          type: object
+        type: array
+      required:
+      - layer2
+      - revision
+      - routingTable
+      - vrf
+      type: object
+    status:
+      description: NodeNetworkConfigStatus defines the observed state of NodeConfig.
+      properties:
+        configStatus:
+          description: ConfigStatus describes provisioning state of the NodeConfig.
+            Can be either 'provisioning' or 'provisioned'.
+ type: string + lastUpdate: + description: LastUpdate determines when last update (change) of the + ConfigStatus field took place. + format: date-time + type: string + required: + - configStatus + - lastUpdate + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index 2ecd84e3..1b371db8 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -5,6 +5,8 @@ resources: - bases/network.schiff.telekom.de_vrfrouteconfigurations.yaml - bases/network.schiff.telekom.de_layer2networkconfigurations.yaml - bases/network.schiff.telekom.de_routingtables.yaml +- bases/network.schiff.telekom.de_nodenetworkconfigs.yaml +- bases/network.schiff.telekom.de_networkconfigrevisions.yaml #+kubebuilder:scaffold:crdkustomizeresource # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index 72d9c77d..9e6bca37 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -42,7 +42,8 @@ kind: Kustomization resources: - ../crd - ../rbac -- ../manager +- ../agent +- ../operator - ../webhook - ../prometheus labels: diff --git a/config/default/manager_auth_proxy_patch.yaml b/config/default/manager_auth_proxy_patch.yaml index 7a48a5ff..e05af3fa 100644 --- a/config/default/manager_auth_proxy_patch.yaml +++ b/config/default/manager_auth_proxy_patch.yaml @@ -3,7 +3,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: worker + name: agent namespace: system spec: template: diff --git a/config/default/manager_config_patch.yaml b/config/default/manager_config_patch.yaml index 0defb7cd..43b69476 100644 --- a/config/default/manager_config_patch.yaml +++ b/config/default/manager_config_patch.yaml @@ -1,20 +1,20 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: worker + name: agent namespace: system spec: template: spec: 
containers: - - name: manager + - name: agent args: - - "--config=controller_manager_config.yaml" + - "--config=controller_agent_config.yaml" volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml + - name: agent-config + mountPath: /controller_agent_config.yaml + subPath: controller_agent_config.yaml volumes: - - name: manager-config + - name: agent-config configMap: - name: manager-config + name: agent-config diff --git a/config/default/manager_master_config_patch.yaml b/config/default/manager_master_config_patch.yaml index 82614073..d8a9b654 100644 --- a/config/default/manager_master_config_patch.yaml +++ b/config/default/manager_master_config_patch.yaml @@ -7,14 +7,14 @@ spec: template: spec: containers: - - name: manager + - name: agent args: - - "--config=controller_manager_config.yaml" + - "--config=controller_agent_config.yaml" volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml + - name: agent-config + mountPath: /controller_agent_config.yaml + subPath: controller_agent_config.yaml volumes: - - name: manager-config + - name: agent-config configMap: - name: manager-config + name: agent-config diff --git a/config/default/manager_master_metrics_patch.yaml b/config/default/manager_master_metrics_patch.yaml index 3d732a5b..8116aa7a 100644 --- a/config/default/manager_master_metrics_patch.yaml +++ b/config/default/manager_master_metrics_patch.yaml @@ -7,7 +7,7 @@ spec: template: spec: containers: - - name: manager + - name: agent ports: - containerPort: 7080 name: metrics diff --git a/config/default/manager_master_webhook_patch.yaml b/config/default/manager_master_webhook_patch.yaml index 5699ef52..49e1f0f2 100644 --- a/config/default/manager_master_webhook_patch.yaml +++ b/config/default/manager_master_webhook_patch.yaml @@ -7,7 +7,7 @@ spec: template: spec: containers: - - name: manager + - name: agent ports: - 
containerPort: 7443
           name: webhook-server
diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml
index 93366657..aac2b23c 100644
--- a/config/default/manager_metrics_patch.yaml
+++ b/config/default/manager_metrics_patch.yaml
@@ -1,13 +1,13 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: worker
+  name: agent
   namespace: system
 spec:
   template:
     spec:
       containers:
-      - name: manager
+      - name: agent
         ports:
         - containerPort: 7080
           name: metrics
diff --git a/config/default/manager_webhook_patch.yaml b/config/default/manager_webhook_patch.yaml
index eebe8977..3f98ef6d 100644
--- a/config/default/manager_webhook_patch.yaml
+++ b/config/default/manager_webhook_patch.yaml
@@ -1,13 +1,13 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: worker
+  name: agent
   namespace: system
 spec:
   template:
     spec:
       containers:
-      - name: manager
+      - name: agent
         ports:
         - containerPort: 7443
           name: webhook-server
diff --git a/config/operator/kustomization.yaml b/config/operator/kustomization.yaml
new file mode 100644
index 00000000..59de5691
--- /dev/null
+++ b/config/operator/kustomization.yaml
@@ -0,0 +1,12 @@
+resources:
+- operator.yaml
+
+generatorOptions:
+  disableNameSuffixHash: true
+
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+images:
+- name: operator
+  newName: ghcr.io/telekom/das-schiff-network-operator
+  newTag: latest
diff --git a/config/operator/operator.yaml b/config/operator/operator.yaml
new file mode 100644
index 00000000..d8b8da56
--- /dev/null
+++ b/config/operator/operator.yaml
@@ -0,0 +1,65 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: operator
+  namespace: system
+  labels:
+    app.kubernetes.io/component: operator
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: operator
+  template:
+    metadata:
+      annotations:
+        kubectl.kubernetes.io/default-container: operator
+      labels:
+        app.kubernetes.io/component: operator
+    spec:
+      affinity:
+        nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist + tolerations: + - effect: NoSchedule + key: node.schiff.telekom.de/uninitialized + operator: Exists + - key: node.cloudprovider.kubernetes.io/uninitialized + value: "true" + effect: NoSchedule + - key: node.kubernetes.io/not-ready + effect: NoSchedule + operator: Exists + hostNetwork: true + hostPID: true + containers: + - command: + - /operator + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + image: operator:latest + name: operator + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + volumeMounts: + - mountPath: /var/state + name: state + serviceAccountName: controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - name: state + hostPath: + path: /var/state + type: DirectoryOrCreate diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b58d5a05..de3c9fa8 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -51,6 +51,58 @@ rules: - get - patch - update +- apiGroups: + - network.schiff.telekom.de + resources: + - networkconfigrevisions + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - network.schiff.telekom.de + resources: + - networkconfigrevisions/finalizers + verbs: + - update +- apiGroups: + - network.schiff.telekom.de + resources: + - networkconfigrevisions/status + verbs: + - get + - patch + - update +- apiGroups: + - network.schiff.telekom.de + resources: + - nodenetworkconfigs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - network.schiff.telekom.de + resources: + - nodenetworkconfigs/finalizers + verbs: + - update +- apiGroups: + - network.schiff.telekom.de + resources: + - nodenetworkconfigs/status + verbs: + - get + - patch + - update - apiGroups: - network.schiff.telekom.de resources: diff --git 
a/controllers/layer2networkconfiguration_controller.go b/controllers/config_controller.go similarity index 58% rename from controllers/layer2networkconfiguration_controller.go rename to controllers/config_controller.go index 54396c27..c8e32410 100644 --- a/controllers/layer2networkconfiguration_controller.go +++ b/controllers/config_controller.go @@ -19,46 +19,46 @@ package controllers import ( "context" "fmt" - "os" "time" - "github.com/google/go-cmp/cmp" networkv1alpha1 "github.com/telekom/das-schiff-network-operator/api/v1alpha1" - "github.com/telekom/das-schiff-network-operator/pkg/healthcheck" "github.com/telekom/das-schiff-network-operator/pkg/reconciler" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" ) const requeueTime = 10 * time.Minute -// Layer2NetworkConfigurationReconciler reconciles a Layer2NetworkConfiguration object. -type Layer2NetworkConfigurationReconciler struct { +// ConfigReconciler reconciles a Layer2NetworkConfiguration, RoutingTable and VRFRouteConfiguration objects. 
+type ConfigReconciler struct { client.Client Scheme *runtime.Scheme - Reconciler *reconciler.Reconciler + Reconciler *reconciler.ConfigReconciler } -//+kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;update;watch //+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=layer2networkconfigurations,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=layer2networkconfigurations/status,verbs=get;update;patch //+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=layer2networkconfigurations/finalizers,verbs=update +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=routingtables,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=routingtables/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=routingtables/finalizers,verbs=update + +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=vrfrouteconfigurations,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=vrfrouteconfigurations/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=vrfrouteconfigurations/finalizers,verbs=update + // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. 
// // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.0/pkg/reconcile -func (r *Layer2NetworkConfigurationReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { +func (r *ConfigReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { _ = log.FromContext(ctx) r.Reconciler.Reconcile(ctx) @@ -67,21 +67,13 @@ func (r *Layer2NetworkConfigurationReconciler) Reconcile(ctx context.Context, _ } // SetupWithManager sets up the controller with the Manager. -func (r *Layer2NetworkConfigurationReconciler) SetupWithManager(mgr ctrl.Manager) error { - // Create empty request for changes to node - nodesMapFn := handler.EnqueueRequestsFromMapFunc(func(_ context.Context, _ client.Object) []reconcile.Request { return []reconcile.Request{{}} }) - nodePredicates := predicate.Funcs{ - CreateFunc: func(_ event.CreateEvent) bool { return false }, - UpdateFunc: func(e event.UpdateEvent) bool { - return os.Getenv(healthcheck.NodenameEnv) == e.ObjectNew.GetName() && !cmp.Equal(e.ObjectNew.GetLabels(), e.ObjectOld.GetLabels()) - }, - DeleteFunc: func(_ event.DeleteEvent) bool { return false }, - GenericFunc: func(_ event.GenericEvent) bool { return false }, - } - +func (r *ConfigReconciler) SetupWithManager(mgr ctrl.Manager) error { + h := handler.EnqueueRequestsFromMapFunc(func(_ context.Context, _ client.Object) []reconcile.Request { return []ctrl.Request{{}} }) err := ctrl.NewControllerManagedBy(mgr). - For(&networkv1alpha1.Layer2NetworkConfiguration{}). - Watches(&corev1.Node{}, nodesMapFn, builder.WithPredicates(nodePredicates)). + Named("config controller"). + Watches(&networkv1alpha1.Layer2NetworkConfiguration{}, h). + Watches(&networkv1alpha1.RoutingTable{}, h). + Watches(&networkv1alpha1.VRFRouteConfiguration{}, h). 
Complete(r) if err != nil { return fmt.Errorf("error creating controller: %w", err) diff --git a/controllers/nodenetworkconfig_controller.go b/controllers/nodenetworkconfig_controller.go new file mode 100644 index 00000000..86ec65a3 --- /dev/null +++ b/controllers/nodenetworkconfig_controller.go @@ -0,0 +1,85 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "fmt" + "os" + "strings" + + networkv1alpha1 "github.com/telekom/das-schiff-network-operator/api/v1alpha1" + "github.com/telekom/das-schiff-network-operator/pkg/healthcheck" + "github.com/telekom/das-schiff-network-operator/pkg/reconciler" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// NodeNetworkConfigReconciler reconciles a NodeNetworkConfig object. 
+type NodeNetworkConfigReconciler struct { + client.Client + Scheme *runtime.Scheme + + Reconciler *reconciler.NodeNetworkConfigReconciler +} + +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=nodenetworkconfigs,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=nodenetworkconfigs/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=nodenetworkconfigs/finalizers,verbs=update + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.14.4/pkg/reconcile +func (r *NodeNetworkConfigReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { + _ = log.FromContext(ctx) + + // Run ReconcileDebounced through debouncer + if err := r.Reconciler.Reconcile(ctx); err != nil { + return ctrl.Result{}, fmt.Errorf("reconciliation error: %w", err) + } + + return ctrl.Result{RequeueAfter: requeueTime}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *NodeNetworkConfigReconciler) SetupWithManager(mgr ctrl.Manager) error { + namePredicates := predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return strings.Contains(e.Object.GetName(), os.Getenv(healthcheck.NodenameEnv)) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return strings.Contains(e.ObjectNew.GetName(), os.Getenv(healthcheck.NodenameEnv)) + }, + DeleteFunc: func(event.DeleteEvent) bool { return false }, + GenericFunc: func(event.GenericEvent) bool { return false }, + } + + err := ctrl.NewControllerManagedBy(mgr). + For(&networkv1alpha1.NodeNetworkConfig{}, builder.WithPredicates(namePredicates)). 
+ Complete(r) + if err != nil { + return fmt.Errorf("error creating controller: %w", err) + } + return nil +} diff --git a/controllers/routingtable_controller.go b/controllers/revision_controller.go similarity index 55% rename from controllers/routingtable_controller.go rename to controllers/revision_controller.go index 47cf8409..5c12403f 100644 --- a/controllers/routingtable_controller.go +++ b/controllers/revision_controller.go @@ -1,5 +1,5 @@ /* -Copyright 2022. +Copyright 2024. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,45 +19,57 @@ package controllers import ( "context" "fmt" + "time" networkv1alpha1 "github.com/telekom/das-schiff-network-operator/api/v1alpha1" "github.com/telekom/das-schiff-network-operator/pkg/reconciler" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" ) -// RoutingTableReconciler reconciles a RoutingTable object. -type RoutingTableReconciler struct { +const ( + revisionRequeueTime = 1 * time.Minute +) + +// NetworkConfigRevisionReconciler reconciles a NetworkConfigRevision object. 
+type RevisionReconciler struct { client.Client Scheme *runtime.Scheme - Reconciler *reconciler.Reconciler + Reconciler *reconciler.ConfigRevisionReconciler } -//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=routingtables,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=routingtables/status,verbs=get;update;patch -//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=routingtables/finalizers,verbs=update +//+kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;update;watch + +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=networkconfigrevisions,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=networkconfigrevisions/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=networkconfigrevisions/finalizers,verbs=update // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. // // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.14.4/pkg/reconcile -func (r *RoutingTableReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { +func (r *RevisionReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { _ = log.FromContext(ctx) // Run ReconcileDebounced through debouncer r.Reconciler.Reconcile(ctx) - return ctrl.Result{RequeueAfter: requeueTime}, nil + return ctrl.Result{RequeueAfter: revisionRequeueTime}, nil } // SetupWithManager sets up the controller with the Manager. -func (r *RoutingTableReconciler) SetupWithManager(mgr ctrl.Manager) error { +func (r *RevisionReconciler) SetupWithManager(mgr ctrl.Manager) error { err := ctrl.NewControllerManagedBy(mgr). - For(&networkv1alpha1.RoutingTable{}). + For(&networkv1alpha1.NetworkConfigRevision{}). 
+ Watches(&corev1.Node{}, &handler.EnqueueRequestForObject{}). + Owns(&networkv1alpha1.NodeNetworkConfig{}, builder.MatchEveryOwner). Complete(r) if err != nil { return fmt.Errorf("error creating controller: %w", err) diff --git a/controllers/vrfrouteconfiguration_controller.go b/controllers/vrfrouteconfiguration_controller.go deleted file mode 100644 index f3d6f153..00000000 --- a/controllers/vrfrouteconfiguration_controller.go +++ /dev/null @@ -1,63 +0,0 @@ -/* -Copyright 2022. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controllers - -import ( - "context" - "fmt" - - networkv1alpha1 "github.com/telekom/das-schiff-network-operator/api/v1alpha1" - "github.com/telekom/das-schiff-network-operator/pkg/reconciler" - "k8s.io/apimachinery/pkg/runtime" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// VRFRouteConfigurationReconciler reconciles a VRFRouteConfiguration object. 
-type VRFRouteConfigurationReconciler struct { - client.Client - Scheme *runtime.Scheme - - Reconciler *reconciler.Reconciler -} - -//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=vrfrouteconfigurations,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=vrfrouteconfigurations/status,verbs=get;update;patch -//+kubebuilder:rbac:groups=network.schiff.telekom.de,resources=vrfrouteconfigurations/finalizers,verbs=update - -// Reconcile is part of the main kubernetes reconciliation loop which aims to -// move the current state of the cluster closer to the desired state. -// -// For more details, check Reconcile and its Result here: -// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile -func (r *VRFRouteConfigurationReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { - // Run ReconcileDebounced through debouncer - r.Reconciler.Reconcile(ctx) - - return ctrl.Result{RequeueAfter: requeueTime}, nil -} - -// SetupWithManager sets up the controller with the Manager. -func (r *VRFRouteConfigurationReconciler) SetupWithManager(mgr ctrl.Manager) error { - err := ctrl.NewControllerManagedBy(mgr). - For(&networkv1alpha1.VRFRouteConfiguration{}). 
- Complete(r) - if err != nil { - return fmt.Errorf("error creating controller: %w", err) - } - return nil -} diff --git a/frr-exporter.Dockerfile b/frr-exporter.Dockerfile index 1d4c9be0..bf8a20dd 100644 --- a/frr-exporter.Dockerfile +++ b/frr-exporter.Dockerfile @@ -1,7 +1,7 @@ ARG FRR_VERSION="10.1.0" ARG REGISTRY="quay.io" # Build the manager binary -FROM docker.io/library/golang:1.21-alpine as builder +FROM docker.io/library/golang:1.21-alpine AS builder WORKDIR /workspace # Copy the Go Modules manifests diff --git a/go.mod b/go.mod index 2ac3c3f2..af9a1946 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,6 @@ require ( github.com/coreos/go-iptables v0.6.0 github.com/coreos/go-systemd/v22 v22.4.0 github.com/go-logr/logr v1.2.4 - github.com/google/go-cmp v0.5.9 github.com/onsi/ginkgo v1.16.4 github.com/onsi/gomega v1.27.10 github.com/prometheus/client_golang v1.15.1 @@ -23,6 +22,8 @@ require ( sigs.k8s.io/controller-runtime v0.15.1 ) +require github.com/google/go-cmp v0.5.9 // indirect + require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect diff --git a/pkg/frr/dbus/dbus.go b/pkg/frr/dbus/dbus.go index f7bf9196..9d826d69 100644 --- a/pkg/frr/dbus/dbus.go +++ b/pkg/frr/dbus/dbus.go @@ -7,7 +7,7 @@ import ( "github.com/coreos/go-systemd/v22/dbus" ) -//go:generate mockgen -destination ./mock/mock_frr.go . System,Connection +//go:generate mockgen -destination ./mock/mock_dbus.go . 
System,Connection type System interface { NewConn(ctx context.Context) (Connection, error) } diff --git a/pkg/frr/dbus/mock/mock_frr.go b/pkg/frr/dbus/mock/mock_dbus.go similarity index 100% rename from pkg/frr/dbus/mock/mock_frr.go rename to pkg/frr/dbus/mock/mock_dbus.go diff --git a/pkg/frr/manager.go b/pkg/frr/manager.go index ffe31008..65c91c9a 100644 --- a/pkg/frr/manager.go +++ b/pkg/frr/manager.go @@ -11,6 +11,7 @@ import ( "github.com/telekom/das-schiff-network-operator/pkg/config" "github.com/telekom/das-schiff-network-operator/pkg/frr/dbus" + "github.com/telekom/das-schiff-network-operator/pkg/nl" ) const defaultPermissions = 0o640 @@ -20,6 +21,16 @@ var ( frrPermissions = fs.FileMode(defaultPermissions) ) +//go:generate mockgen -destination ./mock/mock_frr.go . ManagerInterface +type ManagerInterface interface { + Init(mgmtVrf string) error + ReloadFRR() error + RestartFRR() error + GetStatusFRR() (activeState, subState string, err error) + Configure(in Configuration, nm *nl.Manager) (bool, error) + SetConfigPath(path string) +} + type Manager struct { configTemplate *template.Template @@ -167,6 +178,10 @@ func (m *Manager) GetStatusFRR() (activeState, subState string, err error) { return activeState, subState, nil } +func (m *Manager) SetConfigPath(path string) { + m.ConfigPath = path +} + func (v *VRFConfiguration) ShouldTemplateVRF() bool { return v.VNI != config.SkipVrfTemplateVni } diff --git a/pkg/frr/mock/mock_frr.go b/pkg/frr/mock/mock_frr.go new file mode 100644 index 00000000..da80e82d --- /dev/null +++ b/pkg/frr/mock/mock_frr.go @@ -0,0 +1,121 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/telekom/das-schiff-network-operator/pkg/frr (interfaces: ManagerInterface) + +// Package mock_frr is a generated GoMock package. 
+package mock_frr + +import ( + reflect "reflect" + + frr "github.com/telekom/das-schiff-network-operator/pkg/frr" + nl "github.com/telekom/das-schiff-network-operator/pkg/nl" + gomock "go.uber.org/mock/gomock" +) + +// MockManagerInterface is a mock of ManagerInterface interface. +type MockManagerInterface struct { + ctrl *gomock.Controller + recorder *MockManagerInterfaceMockRecorder +} + +// MockManagerInterfaceMockRecorder is the mock recorder for MockManagerInterface. +type MockManagerInterfaceMockRecorder struct { + mock *MockManagerInterface +} + +// NewMockManagerInterface creates a new mock instance. +func NewMockManagerInterface(ctrl *gomock.Controller) *MockManagerInterface { + mock := &MockManagerInterface{ctrl: ctrl} + mock.recorder = &MockManagerInterfaceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockManagerInterface) EXPECT() *MockManagerInterfaceMockRecorder { + return m.recorder +} + +// Configure mocks base method. +func (m *MockManagerInterface) Configure(arg0 frr.Configuration, arg1 *nl.Manager) (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Configure", arg0, arg1) + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Configure indicates an expected call of Configure. +func (mr *MockManagerInterfaceMockRecorder) Configure(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Configure", reflect.TypeOf((*MockManagerInterface)(nil).Configure), arg0, arg1) +} + +// GetStatusFRR mocks base method. +func (m *MockManagerInterface) GetStatusFRR() (string, string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetStatusFRR") + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(string) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// GetStatusFRR indicates an expected call of GetStatusFRR. 
+func (mr *MockManagerInterfaceMockRecorder) GetStatusFRR() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetStatusFRR", reflect.TypeOf((*MockManagerInterface)(nil).GetStatusFRR)) +} + +// Init mocks base method. +func (m *MockManagerInterface) Init(arg0 string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Init", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// Init indicates an expected call of Init. +func (mr *MockManagerInterfaceMockRecorder) Init(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Init", reflect.TypeOf((*MockManagerInterface)(nil).Init), arg0) +} + +// ReloadFRR mocks base method. +func (m *MockManagerInterface) ReloadFRR() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ReloadFRR") + ret0, _ := ret[0].(error) + return ret0 +} + +// ReloadFRR indicates an expected call of ReloadFRR. +func (mr *MockManagerInterfaceMockRecorder) ReloadFRR() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReloadFRR", reflect.TypeOf((*MockManagerInterface)(nil).ReloadFRR)) +} + +// RestartFRR mocks base method. +func (m *MockManagerInterface) RestartFRR() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RestartFRR") + ret0, _ := ret[0].(error) + return ret0 +} + +// RestartFRR indicates an expected call of RestartFRR. +func (mr *MockManagerInterfaceMockRecorder) RestartFRR() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RestartFRR", reflect.TypeOf((*MockManagerInterface)(nil).RestartFRR)) +} + +// SetConfigPath mocks base method. +func (m *MockManagerInterface) SetConfigPath(arg0 string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "SetConfigPath", arg0) +} + +// SetConfigPath indicates an expected call of SetConfigPath. 
+func (mr *MockManagerInterfaceMockRecorder) SetConfigPath(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetConfigPath", reflect.TypeOf((*MockManagerInterface)(nil).SetConfigPath), arg0) +} diff --git a/pkg/healthcheck/healthcheck.go b/pkg/healthcheck/healthcheck.go index b3084b4c..ec7688de 100644 --- a/pkg/healthcheck/healthcheck.go +++ b/pkg/healthcheck/healthcheck.go @@ -43,8 +43,8 @@ var ( // HealthChecker is a struct that holds data required for networking healthcheck. type HealthChecker struct { - client client.Client - isNetworkingHealthy bool + client client.Client + taintsRemoved bool logr.Logger netConfig *NetHealthcheckConfig toolkit *Toolkit @@ -61,18 +61,18 @@ func NewHealthChecker(clusterClient client.Client, toolkit *Toolkit, netconf *Ne } return &HealthChecker{ - client: clusterClient, - isNetworkingHealthy: false, - Logger: log.Log.WithName("HealthCheck"), - netConfig: netconf, - toolkit: toolkit, - retries: retries, + client: clusterClient, + taintsRemoved: false, + Logger: log.Log.WithName("HealthCheck"), + netConfig: netconf, + toolkit: toolkit, + retries: retries, }, nil } -// IsNetworkingHealthy returns value of isNetworkingHealthly bool. -func (hc *HealthChecker) IsNetworkingHealthy() bool { - return hc.isNetworkingHealthy +// TaintsRemoved returns value of isNetworkingHealthly bool. +func (hc *HealthChecker) TaintsRemoved() bool { + return hc.taintsRemoved } // RemoveTaints removes taint from the node. 
@@ -102,7 +102,7 @@ func (hc *HealthChecker) RemoveTaints(ctx context.Context) error { } } - hc.isNetworkingHealthy = true + hc.taintsRemoved = true return nil } @@ -137,17 +137,6 @@ func (hc *HealthChecker) CheckInterfaces() error { return nil } -func (hc *HealthChecker) checkInterface(intf string) error { - link, err := hc.toolkit.linkByName(intf) - if err != nil { - return err - } - if link.Attrs().OperState != netlink.OperUp { - return errors.New("link " + intf + " is not up - current state: " + link.Attrs().OperState.String()) - } - return nil -} - // CheckReachability checks if all hosts in Reachability slice are reachable. func (hc *HealthChecker) CheckReachability() error { for _, i := range hc.netConfig.Reachability { @@ -163,6 +152,25 @@ func (hc *HealthChecker) CheckReachability() error { return nil } +// CheckAPIServer checks if Kubernetes Api server is reachable from the pod. +func (hc HealthChecker) CheckAPIServer(ctx context.Context) error { + if err := hc.client.List(ctx, &corev1.NodeList{}); err != nil { + return fmt.Errorf("unable to reach API server: %w", err) + } + return nil +} + +func (hc *HealthChecker) checkInterface(intf string) error { + link, err := hc.toolkit.linkByName(intf) + if err != nil { + return err + } + if link.Attrs().OperState != netlink.OperUp { + return errors.New("link " + intf + " is not up - current state: " + link.Attrs().OperState.String()) + } + return nil +} + func (hc *HealthChecker) checkReachabilityItem(r netReachabilityItem) error { target := r.Host + ":" + strconv.Itoa(r.Port) conn, err := hc.toolkit.tcpDialer.Dial("tcp", target) diff --git a/pkg/healthcheck/healthcheck_test.go b/pkg/healthcheck/healthcheck_test.go index 2ef639be..074ef408 100644 --- a/pkg/healthcheck/healthcheck_test.go +++ b/pkg/healthcheck/healthcheck_test.go @@ -112,10 +112,10 @@ var _ = Describe("RemoveTaints()", func() { hc, err := NewHealthChecker(c, nil, nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - 
Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.RemoveTaints(context.Background()) Expect(err).To(HaveOccurred()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) }) It("returns error when trying to remove taint (update node)", func() { c := &updateErrorClient{} @@ -123,21 +123,21 @@ var _ = Describe("RemoveTaints()", func() { hc, err := NewHealthChecker(c, nil, nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.RemoveTaints(context.Background()) Expect(err).To(HaveOccurred()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) }) - It("removes taint and set isInitialized true", func() { + It("remove taint and set isInitialized true", func() { c := fake.NewClientBuilder().WithRuntimeObjects(fakeNodes).Build() nc := &NetHealthcheckConfig{} hc, err := NewHealthChecker(c, nil, nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.RemoveTaints(context.Background()) Expect(err).ToNot(HaveOccurred()) - Expect(hc.IsNetworkingHealthy()).To(BeTrue()) + Expect(hc.TaintsRemoved()).To(BeTrue()) }) }) var _ = Describe("CheckInterfaces()", func() { @@ -147,10 +147,10 @@ var _ = Describe("CheckInterfaces()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeErrorGetByName, &net.Dialer{Timeout: time.Duration(3)}), nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.CheckInterfaces() Expect(err).To(HaveOccurred()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) }) It("returns error if interface is not up", func() { c := fake.NewClientBuilder().Build() @@ 
-158,10 +158,10 @@ var _ = Describe("CheckInterfaces()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeDownGetByName, &net.Dialer{Timeout: time.Duration(3)}), nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.CheckInterfaces() Expect(err).To(HaveOccurred()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) }) It("returns error if all links are up", func() { c := fake.NewClientBuilder().Build() @@ -169,10 +169,10 @@ var _ = Describe("CheckInterfaces()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeUpGetByName, NewTCPDialer("")), nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.CheckInterfaces() Expect(err).ToNot(HaveOccurred()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) }) }) var _ = Describe("NewTcpDialer()", func() { @@ -181,7 +181,7 @@ var _ = Describe("NewTcpDialer()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeUpGetByName, NewTCPDialer("")), &NetHealthcheckConfig{}) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) d := hc.toolkit.tcpDialer.(*net.Dialer) Expect(d.Timeout).To(Equal(time.Second * 3)) }) @@ -190,7 +190,7 @@ var _ = Describe("NewTcpDialer()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeUpGetByName, NewTCPDialer("5")), &NetHealthcheckConfig{}) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) d := hc.toolkit.tcpDialer.(*net.Dialer) Expect(d.Timeout).To(Equal(time.Second * 5)) }) @@ -199,7 +199,7 @@ var _ = 
Describe("NewTcpDialer()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeUpGetByName, NewTCPDialer("500ms")), &NetHealthcheckConfig{}) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) d := hc.toolkit.tcpDialer.(*net.Dialer) Expect(d.Timeout).To(Equal(time.Millisecond * 500)) }) @@ -216,7 +216,7 @@ var _ = Describe("CheckReachability()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeUpGetByName, dialerMock), nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.CheckReachability() Expect(err).To(HaveOccurred()) }) @@ -229,7 +229,7 @@ var _ = Describe("CheckReachability()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeUpGetByName, dialerMock), nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.CheckReachability() Expect(err).ToNot(HaveOccurred()) }) @@ -242,7 +242,7 @@ var _ = Describe("CheckReachability()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeUpGetByName, dialerMock), nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.CheckReachability() Expect(err).ToNot(HaveOccurred()) }) @@ -256,11 +256,21 @@ var _ = Describe("CheckReachability()", func() { hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, fakeUpGetByName, dialerMock), nc) Expect(err).ToNot(HaveOccurred()) Expect(hc).ToNot(BeNil()) - Expect(hc.IsNetworkingHealthy()).To(BeFalse()) + Expect(hc.TaintsRemoved()).To(BeFalse()) err = hc.CheckReachability() Expect(err).To(HaveOccurred()) }) }) +var _ = Describe("CheckAPIServer()", func() { + It("should return no error", 
func() { + c := fake.NewClientBuilder().Build() + hc, err := NewHealthChecker(c, NewHealthCheckToolkit(nil, nil, nil), &NetHealthcheckConfig{}) + Expect(err).ToNot(HaveOccurred()) + Expect(hc).ToNot(BeNil()) + err = hc.CheckAPIServer(context.TODO()) + Expect(err).ToNot(HaveOccurred()) + }) +}) func fakeErrorGetByName(_ string) (netlink.Link, error) { return nil, errors.New("Link not found") diff --git a/pkg/managerconfig/managerconfig_test.go b/pkg/managerconfig/managerconfig_test.go index 1ca53f82..c055743e 100644 --- a/pkg/managerconfig/managerconfig_test.go +++ b/pkg/managerconfig/managerconfig_test.go @@ -8,14 +8,10 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) -var _ = BeforeSuite(func() { - -}) - -func TestHealthCheck(t *testing.T) { +func TestManagerConfig(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, - "HealthCheck Suite") + "ManagerConfig Suite") } var _ = Describe("Load()", func() { diff --git a/pkg/reconciler/config_reconciler.go b/pkg/reconciler/config_reconciler.go new file mode 100644 index 00000000..533732c1 --- /dev/null +++ b/pkg/reconciler/config_reconciler.go @@ -0,0 +1,192 @@ +package reconciler + +import ( + "context" + "fmt" + "slices" + "time" + + "github.com/go-logr/logr" + "github.com/telekom/das-schiff-network-operator/api/v1alpha1" + "github.com/telekom/das-schiff-network-operator/pkg/debounce" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + DefaultTimeout = "60s" + DefaultNodeUpdateLimit = 1 +) + +// ConfigReconciler is responsible for creating NodeConfig objects. +type ConfigReconciler struct { + logger logr.Logger + debouncer *debounce.Debouncer + client client.Client + timeout time.Duration +} + +type reconcileConfig struct { + *ConfigReconciler + logr.Logger +} + +// Reconcile starts reconciliation. 
+func (cr *ConfigReconciler) Reconcile(ctx context.Context) { + cr.debouncer.Debounce(ctx) +} + +// NewConfigReconciler creates new reconciler that creates NetworkConfigRevision objects. +func NewConfigReconciler(clusterClient client.Client, logger logr.Logger, timeout time.Duration) (*ConfigReconciler, error) { + reconciler := &ConfigReconciler{ + logger: logger, + timeout: timeout, + client: clusterClient, + } + + reconciler.debouncer = debounce.NewDebouncer(reconciler.ReconcileDebounced, defaultDebounceTime, logger) + + return reconciler, nil +} + +func (cr *ConfigReconciler) ReconcileDebounced(ctx context.Context) error { + r := &reconcileConfig{ + ConfigReconciler: cr, + Logger: cr.logger, + } + + cr.logger.Info("fetching config data...") + + timeoutCtx, cancel := context.WithTimeout(ctx, cr.timeout) + defer cancel() + + // get VRFRouteConfiguration, Layer2networkConfiguration and RoutingTable objects + configData, err := r.fetchConfigData(timeoutCtx) + if err != nil { + return fmt.Errorf("error fetching configuration details: %w", err) + } + + // prepare new revision + revision, err := v1alpha1.NewRevision(configData) + if err != nil { + return fmt.Errorf("error preparing new config revision: %w", err) + } + + cr.logger.Info("new revision", "data", revision) + + // get all known revisions + revisions, err := listRevisions(timeoutCtx, cr.client) + if err != nil { + return fmt.Errorf("error listing revisions: %w", err) + } + + // check if revision should be skipped (e.g. 
it is the same as known invalid revision, or as currently deployed revision) + if shouldSkip(revisions, revision) { + return nil + } + + // create revision object + if err := r.createRevision(timeoutCtx, revision); err != nil { + return fmt.Errorf("error creating revision %s: %w", revision.Spec.Revision[0:10], err) + } + + cr.logger.Info("deployed", "revision", revision.Spec.Revision) + return nil +} + +func shouldSkip(revisions *v1alpha1.NetworkConfigRevisionList, processedRevision *v1alpha1.NetworkConfigRevision) bool { + if len(revisions.Items) > 0 && revisions.Items[0].Spec.Revision == processedRevision.Spec.Revision { + // new revision equals to the last known one - skip (no update is required) + return true + } + + for i := range revisions.Items { + if !revisions.Items[i].Status.IsInvalid { + if revisions.Items[i].Spec.Revision == processedRevision.Spec.Revision { + // new revision equals to the last known valid one - skip (should be already deployed) + return true + } + break + } + } + + for i := range revisions.Items { + if (revisions.Items[i].Spec.Revision == processedRevision.Spec.Revision) && revisions.Items[i].Status.IsInvalid { + // new revision is equal to known invalid revision - skip + return true + } + } + + return false +} + +func (r *reconcileConfig) createRevision(ctx context.Context, revision *v1alpha1.NetworkConfigRevision) error { + if err := r.client.Create(ctx, revision); err != nil { + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("error creating NodeConfigRevision: %w", err) + } + if err := r.client.Delete(ctx, revision); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("error deleting old instance of revision %s: %w", revision.Name, err) + } + if err := r.client.Create(ctx, revision); err != nil { + return fmt.Errorf("error creating new instance of revision %s: %w", revision.Name, err) + } + } + return nil +} + +func (r *reconcileConfig) fetchConfigData(ctx context.Context) (*v1alpha1.NodeNetworkConfig, error) { 
+ // get VRFRouteConfiguration objects + l3vnis, err := r.fetchLayer3(ctx) + if err != nil { + return nil, err + } + + // get Layer2networkConfiguration objects + l2vnis, err := r.fetchLayer2(ctx) + if err != nil { + return nil, err + } + + // get RoutingTable objects + taas, err := r.fetchTaas(ctx) + if err != nil { + return nil, err + } + + config := &v1alpha1.NodeNetworkConfig{} + + // discard metadata from previously fetched objects + config.Spec.Layer2 = []v1alpha1.Layer2NetworkConfigurationSpec{} + for i := range l2vnis { + config.Spec.Layer2 = append(config.Spec.Layer2, l2vnis[i].Spec) + } + + config.Spec.Vrf = []v1alpha1.VRFRouteConfigurationSpec{} + for i := range l3vnis { + config.Spec.Vrf = append(config.Spec.Vrf, l3vnis[i].Spec) + } + + config.Spec.RoutingTable = []v1alpha1.RoutingTableSpec{} + for i := range taas { + config.Spec.RoutingTable = append(config.Spec.RoutingTable, taas[i].Spec) + } + + return config, nil +} + +func listRevisions(ctx context.Context, c client.Client) (*v1alpha1.NetworkConfigRevisionList, error) { + revisions := &v1alpha1.NetworkConfigRevisionList{} + if err := c.List(ctx, revisions); err != nil { + return nil, fmt.Errorf("error listing revisions: %w", err) + } + + // sort revisions by creation date ascending (newest first) + if len(revisions.Items) > 0 { + slices.SortFunc(revisions.Items, func(a, b v1alpha1.NetworkConfigRevision) int { + return b.GetCreationTimestamp().Compare(a.GetCreationTimestamp().Time) // newest first + }) + } + + return revisions, nil +} diff --git a/pkg/reconciler/configrevision_reconciler.go b/pkg/reconciler/configrevision_reconciler.go new file mode 100644 index 00000000..09bd7a9f --- /dev/null +++ b/pkg/reconciler/configrevision_reconciler.go @@ -0,0 +1,466 @@ +package reconciler + +import ( + "context" + "errors" + "fmt" + "slices" + "strings" + "time" + + "github.com/go-logr/logr" + "github.com/telekom/das-schiff-network-operator/api/v1alpha1" + 
"github.com/telekom/das-schiff-network-operator/pkg/debounce" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/selection" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +const ( + StatusInvalid = "invalid" + StatusProvisioning = "provisioning" + StatusProvisioned = "provisioned" + + controlPlaneLabel = "node-role.kubernetes.io/control-plane" + numOfRefs = 2 + configTimeout = time.Minute * 2 +) + +// ConfigRevisionReconciler is responsible for creating NodeConfig objects. +type ConfigRevisionReconciler struct { + logger logr.Logger + debouncer *debounce.Debouncer + client client.Client + timeout time.Duration + scheme *runtime.Scheme + maxUpdating int +} + +// Reconcile starts reconciliation. +func (crr *ConfigRevisionReconciler) Reconcile(ctx context.Context) { + crr.debouncer.Debounce(ctx) +} + +// // NewNodeConfigReconciler creates new reconciler that creates NodeConfig objects. 
+func NewNodeConfigReconciler(clusterClient client.Client, logger logr.Logger, timeout time.Duration, s *runtime.Scheme, maxUpdating int) (*ConfigRevisionReconciler, error) { + reconciler := &ConfigRevisionReconciler{ + logger: logger, + timeout: timeout, + client: clusterClient, + scheme: s, + maxUpdating: maxUpdating, + } + + reconciler.debouncer = debounce.NewDebouncer(reconciler.reconcileDebounced, defaultDebounceTime, logger) + + return reconciler, nil +} + +func (crr *ConfigRevisionReconciler) reconcileDebounced(ctx context.Context) error { + revisions, err := listRevisions(ctx, crr.client) + if err != nil { + return fmt.Errorf("error listing revisions: %w", err) + } + + nodes, err := listNodes(ctx, crr.client) + if err != nil { + return fmt.Errorf("error listing nodes: %w", err) + } + + nodeConfigs, err := crr.listConfigs(ctx) + if err != nil { + return fmt.Errorf("error listing configs: %w", err) + } + + totalNodes := len(nodes) + for i := range revisions.Items { + if err := crr.processConfigsForRevision(ctx, nodeConfigs.Items, &revisions.Items[i], totalNodes); err != nil { + return fmt.Errorf("failed to process configs for revision %s: %w", revisions.Items[i].Name, err) + } + } + + revisionToDeploy := getFirstValidRevision(revisions.Items) + + // there is nothing to deploy - skip + if revisionToDeploy == nil { + return nil + } + + nodesToDeploy := getOutdatedNodes(nodes, nodeConfigs.Items, revisionToDeploy) + + if err := crr.updateQueueCounters(ctx, revisions.Items, revisionToDeploy, len(nodesToDeploy)); err != nil { + return fmt.Errorf("failed to update queue counters: %w", err) + } + + if revisionToDeploy.Status.Ongoing < crr.maxUpdating && len(nodesToDeploy) > 0 { + if err := crr.deployNodeConfig(ctx, nodesToDeploy[0], revisionToDeploy); err != nil { + return fmt.Errorf("error deploying node configurations: %w", err) + } + } + + // remove all but last known valid revision + if err := crr.revisionCleanup(ctx); err != nil { + return fmt.Errorf("error 
cleaning redundant revisions: %w", err)
+	}
+
+	return nil
+}
+
+func getFirstValidRevision(revisions []v1alpha1.NetworkConfigRevision) *v1alpha1.NetworkConfigRevision {
+	i := slices.IndexFunc(revisions, func(r v1alpha1.NetworkConfigRevision) bool {
+		return !r.Status.IsInvalid
+	})
+	if i > -1 {
+		return &revisions[i]
+	}
+	return nil
+}
+
+func (crr *ConfigRevisionReconciler) processConfigsForRevision(ctx context.Context, configs []v1alpha1.NodeNetworkConfig, revision *v1alpha1.NetworkConfigRevision, totalNodes int) error {
+	configs, err := crr.removeRedundantConfigs(ctx, configs)
+	if err != nil {
+		return fmt.Errorf("failed to remove redundant configs: %w", err)
+	}
+	ready, ongoing, invalid := getRevisionCounters(configs, revision)
+
+	if err := crr.updateRevisionCounters(ctx, revision, totalNodes, ready, ongoing); err != nil {
+		return fmt.Errorf("failed to update revision's %s counters: %w", revision.Name, err)
+	}
+
+	if invalid > 0 {
+		if err := crr.invalidateRevision(ctx, revision); err != nil {
+			return fmt.Errorf("failed to invalidate revision %s: %w", revision.Name, err)
+		}
+	}
+
+	return nil
+}
+
+func getRevisionCounters(configs []v1alpha1.NodeNetworkConfig, revision *v1alpha1.NetworkConfigRevision) (ready, ongoing, invalid int) {
+	ready = 0
+	ongoing = 0
+	invalid = 0
+	for i := range configs {
+		if configs[i].Spec.Revision == revision.Spec.Revision {
+			switch configs[i].Status.ConfigStatus {
+			case StatusInvalid:
+				// Increase 'invalid' counter so we know that the revision results in invalid configs.
+				invalid++
+			case StatusProvisioning, "":
+				// Update ongoing counter
+				ongoing++
+				if wasConfigTimeoutReached(&configs[i]) {
+					// If timeout was reached revision is invalid (but still counts as ongoing).
+ invalid++ + } + case StatusProvisioned: + // Update ready counter + ready++ + } + } + } + return ready, ongoing, invalid +} + +func (crr *ConfigRevisionReconciler) removeRedundantConfigs(ctx context.Context, configs []v1alpha1.NodeNetworkConfig) ([]v1alpha1.NodeNetworkConfig, error) { + cfg := []v1alpha1.NodeNetworkConfig{} + for i := range configs { + // Every NodeNetworkConfig obejct should have 2 owner references - for NodeConfigRevision and for the Node. If there is only one owner reference, + // it means that either node or revision were deleted, so the config itself can be deleted as well. + if len(configs[i].ObjectMeta.OwnerReferences) < numOfRefs { + if err := crr.client.Delete(ctx, &configs[i]); err != nil && !apierrors.IsNotFound(err) { + return nil, fmt.Errorf("error deleting redundant node config - %s: %w", configs[i].Name, err) + } + } else { + cfg = append(cfg, configs[i]) + } + } + return cfg, nil +} + +func (crr *ConfigRevisionReconciler) invalidateRevision(ctx context.Context, revision *v1alpha1.NetworkConfigRevision) error { + revision.Status.IsInvalid = true + if err := crr.client.Status().Update(ctx, revision); err != nil { + return fmt.Errorf("failed to update revision status %s: %w", revision.Name, err) + } + return nil +} + +func wasConfigTimeoutReached(cfg *v1alpha1.NodeNetworkConfig) bool { + return time.Now().After(cfg.Status.LastUpdate.Add(configTimeout)) +} + +func getOutdatedNodes(nodes map[string]*corev1.Node, configs []v1alpha1.NodeNetworkConfig, revision *v1alpha1.NetworkConfigRevision) []*corev1.Node { + for nodeName := range nodes { + for i := range configs { + if configs[i].Name == nodeName && configs[i].Spec.Revision == revision.Spec.Revision { + delete(nodes, nodeName) + break + } + } + } + + nodesToDeploy := []*corev1.Node{} + for _, node := range nodes { + nodesToDeploy = append(nodesToDeploy, node) + } + return nodesToDeploy +} + +func (crr *ConfigRevisionReconciler) updateRevisionCounters(ctx context.Context, revision 
*v1alpha1.NetworkConfigRevision, total, ready, ongoing int) error {
+	revision.Status.Total = total
+	revision.Status.Ready = ready
+	revision.Status.Ongoing = ongoing
+
+	if err := crr.client.Status().Update(ctx, revision); err != nil {
+		return fmt.Errorf("error updating revision's status %s: %w", revision.Name, err)
+	}
+	return nil
+}
+
+func (crr *ConfigRevisionReconciler) updateQueueCounters(ctx context.Context, revisions []v1alpha1.NetworkConfigRevision, currentRevision *v1alpha1.NetworkConfigRevision, queued int) error {
+	for i := range revisions {
+		q := 0
+		if revisions[i].Spec.Revision == currentRevision.Spec.Revision {
+			q = queued
+		}
+		revisions[i].Status.Queued = q
+		if err := crr.client.Status().Update(ctx, &revisions[i]); err != nil {
+			return fmt.Errorf("error updating queue counter for revision %s: %w", revisions[i].Name, err)
+		}
+	}
+	return nil
+}
+
+func (crr *ConfigRevisionReconciler) revisionCleanup(ctx context.Context) error {
+	revisions, err := listRevisions(ctx, crr.client)
+	if err != nil {
+		return fmt.Errorf("error listing revisions: %w", err)
+	}
+
+	if len(revisions.Items) > 1 {
+		nodeConfigs, err := crr.listConfigs(ctx)
+		if err != nil {
+			return fmt.Errorf("error listing configs: %w", err)
+		}
+		if !revisions.Items[0].Status.IsInvalid && revisions.Items[0].Status.Ready == revisions.Items[0].Status.Total {
+			for i := 1; i < len(revisions.Items); i++ {
+				if countReferences(&revisions.Items[i], nodeConfigs.Items) == 0 {
+					if err := crr.client.Delete(ctx, &revisions.Items[i]); err != nil {
+						return fmt.Errorf("error deleting revision %s: %w", revisions.Items[i].Name, err)
+					}
+				}
+			}
+		}
+	}
+
+	return nil
+}
+
+func countReferences(revision *v1alpha1.NetworkConfigRevision, configs []v1alpha1.NodeNetworkConfig) int {
+	refCnt := 0
+	for j := range configs {
+		if configs[j].Spec.Revision == revision.Spec.Revision {
+			refCnt++
+		}
+	}
+	return refCnt
+}
+
+func (crr *ConfigRevisionReconciler) listConfigs(ctx context.Context)
(*v1alpha1.NodeNetworkConfigList, error) { + nodeConfigs := &v1alpha1.NodeNetworkConfigList{} + if err := crr.client.List(ctx, nodeConfigs); err != nil { + return nil, fmt.Errorf("error listing nodeConfigs: %w", err) + } + return nodeConfigs, nil +} + +func (crr *ConfigRevisionReconciler) deployNodeConfig(ctx context.Context, node *corev1.Node, revision *v1alpha1.NetworkConfigRevision) error { + currentConfig := &v1alpha1.NodeNetworkConfig{} + if err := crr.client.Get(ctx, types.NamespacedName{Name: node.Name}, currentConfig); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("error getting NodeNetworkConfig object for node %s: %w", node.Name, err) + } + currentConfig = nil + } + + if currentConfig != nil && currentConfig.Spec.Revision == revision.Spec.Revision { + // current config is the same as current revision - skip + return nil + } + + newConfig, err := crr.createConfigForNode(node, revision) + if err != nil { + return fmt.Errorf("error preparing config for node %s: %w", node.Name, err) + } + + if err := crr.deployConfig(ctx, newConfig, currentConfig, node); err != nil { + if errors.Is(err, InvalidConfigError) || errors.Is(err, context.DeadlineExceeded) { + // revision results in invalid config or in context timeout - invalidate revision + revision.Status.IsInvalid = true + if err := crr.client.Status().Update(ctx, revision); err != nil { + return fmt.Errorf("error invalidating revision %s: %w", revision.Name, err) + } + } + return fmt.Errorf("error deploying config for node %s: %w", node.Name, err) + } + return nil +} + +func (crr *ConfigRevisionReconciler) createConfigForNode(node *corev1.Node, revision *v1alpha1.NetworkConfigRevision) (*v1alpha1.NodeNetworkConfig, error) { + // create new config + c := &v1alpha1.NodeNetworkConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: node.Name, + }, + } + + c.Spec = *revision.Spec.Config.DeepCopy() + c.Spec.Revision = revision.Spec.Revision + c.Name = node.Name + + if err := 
controllerutil.SetOwnerReference(node, c, scheme.Scheme); err != nil { + return nil, fmt.Errorf("error setting owner references (node): %w", err) + } + + if err := controllerutil.SetOwnerReference(revision, c, crr.scheme); err != nil { + return nil, fmt.Errorf("error setting owner references (revision): %w", err) + } + + // prepare Layer2NetworkConfigurationSpec (l2Spec) for each node. + // Each Layer2NetworkConfigurationSpec from l2Spec has node selector, + // which should be used to add config to proper nodes. + // Each Layer2NetworkConfigurationSpec that don't match the node selector + // is removed. + var err error + c.Spec.Layer2 = slices.DeleteFunc(c.Spec.Layer2, func(s v1alpha1.Layer2NetworkConfigurationSpec) bool { + if err != nil { + // skip if any errors occurred + return false + } + if s.NodeSelector == nil { + // node selector is not defined for the spec. + // Layer2 is global - just continue + return false + } + + // node selector of type v1.labelSelector has to be converted + // to labels.Selector type to be used with controller-runtime client + var selector labels.Selector + selector, err = convertSelector(s.NodeSelector.MatchLabels, s.NodeSelector.MatchExpressions) + if err != nil { + return false + } + + // remove currently processed Layer2NetworkConfigurationSpec if node does not match the selector + return !selector.Matches(labels.Set(node.ObjectMeta.Labels)) + }) + + if err != nil { + return nil, fmt.Errorf("failed to delete redundant Layer2NetworkConfigurationSpec: %w", err) + } + + // set config as next config for the node + return c, nil +} + +func convertSelector(matchLabels map[string]string, matchExpressions []metav1.LabelSelectorRequirement) (labels.Selector, error) { + selector := labels.NewSelector() + var reqs labels.Requirements + + for key, value := range matchLabels { + requirement, err := labels.NewRequirement(key, selection.Equals, []string{value}) + if err != nil { + return nil, fmt.Errorf("error creating MatchLabel requirement: 
%w", err) + } + reqs = append(reqs, *requirement) + } + + for _, req := range matchExpressions { + lowercaseOperator := selection.Operator(strings.ToLower(string(req.Operator))) + requirement, err := labels.NewRequirement(req.Key, lowercaseOperator, req.Values) + if err != nil { + return nil, fmt.Errorf("error creating MatchExpression requirement: %w", err) + } + reqs = append(reqs, *requirement) + } + selector = selector.Add(reqs...) + + return selector, nil +} + +func (crr *ConfigRevisionReconciler) deployConfig(ctx context.Context, newConfig, currentConfig *v1alpha1.NodeNetworkConfig, node *corev1.Node) error { + var cfg *v1alpha1.NodeNetworkConfig + if currentConfig != nil { + cfg = currentConfig + // there already is config for node - update + cfg.Spec = newConfig.Spec + cfg.ObjectMeta.OwnerReferences = newConfig.ObjectMeta.OwnerReferences + cfg.Name = node.Name + if err := crr.client.Update(ctx, cfg); err != nil { + return fmt.Errorf("error updating config for node %s: %w", node.Name, err) + } + } else { + cfg = newConfig + // there is no config for node - create one + if err := crr.client.Create(ctx, cfg); err != nil { + return fmt.Errorf("error creating config for node %s: %w", node.Name, err) + } + } + + if err := setStatus(ctx, crr.client, cfg, ""); err != nil && !apierrors.IsConflict(err) { + // discard conflict error as it can be encountered if agent will update NodeNetworkConfig status first (race condition) + return fmt.Errorf("error setting config '%s' status: %w", cfg.Name, err) + } + + return nil +} + +func listNodes(ctx context.Context, c client.Client) (map[string]*corev1.Node, error) { + // list all nodes + list := &corev1.NodeList{} + if err := c.List(ctx, list); err != nil { + return nil, fmt.Errorf("unable to list nodes: %w", err) + } + + // discard control-plane and not-ready nodes + nodes := map[string]*corev1.Node{} + for i := range list.Items { + _, isControlPlane := list.Items[i].Labels[controlPlaneLabel] + if !isControlPlane { + // 
discard nodes that are not in ready state + for j := range list.Items[i].Status.Conditions { + // TODO: Should taint node.kubernetes.io/not-ready be used instead of Conditions? + if list.Items[i].Status.Conditions[j].Type == corev1.NodeReady && + list.Items[i].Status.Conditions[j].Status == corev1.ConditionTrue { + nodes[list.Items[i].Name] = &list.Items[i] + break + } + } + } + } + + return nodes, nil +} + +type ConfigError struct { + Message string +} + +func (e *ConfigError) Error() string { + return e.Message +} + +func (*ConfigError) Is(target error) bool { + _, ok := target.(*ConfigError) + return ok +} + +var InvalidConfigError = &ConfigError{Message: "invalid config"} diff --git a/pkg/reconciler/layer2.go b/pkg/reconciler/layer2.go index 941b28f6..49a26850 100644 --- a/pkg/reconciler/layer2.go +++ b/pkg/reconciler/layer2.go @@ -4,19 +4,12 @@ import ( "context" "fmt" "net" - "os" - "strings" networkv1alpha1 "github.com/telekom/das-schiff-network-operator/api/v1alpha1" - "github.com/telekom/das-schiff-network-operator/pkg/healthcheck" "github.com/telekom/das-schiff-network-operator/pkg/nl" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/selection" - "k8s.io/apimachinery/pkg/types" ) -func (r *reconcile) fetchLayer2(ctx context.Context) ([]networkv1alpha1.Layer2NetworkConfiguration, error) { +func (r *reconcileConfig) fetchLayer2(ctx context.Context) ([]networkv1alpha1.Layer2NetworkConfiguration, error) { layer2List := &networkv1alpha1.Layer2NetworkConfigurationList{} err := r.client.List(ctx, layer2List) if err != nil { @@ -24,59 +17,17 @@ func (r *reconcile) fetchLayer2(ctx context.Context) ([]networkv1alpha1.Layer2Ne return nil, fmt.Errorf("error getting list of Layer2s from Kubernetes: %w", err) } - nodeName := os.Getenv(healthcheck.NodenameEnv) - node := &corev1.Node{} - err = r.client.Get(ctx, types.NamespacedName{Name: nodeName}, node) - if err != nil { - r.Logger.Error(err, "error getting local node name") - 
return nil, fmt.Errorf("error getting local node name: %w", err) - } - l2vnis := []networkv1alpha1.Layer2NetworkConfiguration{} - for i := range layer2List.Items { - item := &layer2List.Items[i] - logger := r.Logger.WithValues("name", item.ObjectMeta.Name, "namespace", item.ObjectMeta.Namespace, "vlan", item.Spec.ID, "vni", item.Spec.VNI) - if item.Spec.NodeSelector != nil { - selector := labels.NewSelector() - var reqs labels.Requirements - - for key, value := range item.Spec.NodeSelector.MatchLabels { - requirement, err := labels.NewRequirement(key, selection.Equals, []string{value}) - if err != nil { - logger.Error(err, "error creating MatchLabel requirement") - return nil, fmt.Errorf("error creating MatchLabel requirement: %w", err) - } - reqs = append(reqs, *requirement) - } - - for _, req := range item.Spec.NodeSelector.MatchExpressions { - lowercaseOperator := selection.Operator(strings.ToLower(string(req.Operator))) - requirement, err := labels.NewRequirement(req.Key, lowercaseOperator, req.Values) - if err != nil { - logger.Error(err, "error creating MatchExpression requirement") - return nil, fmt.Errorf("error creating MatchExpression requirement: %w", err) - } - reqs = append(reqs, *requirement) - } - selector = selector.Add(reqs...) - - if !selector.Matches(labels.Set(node.ObjectMeta.Labels)) { - logger.Info("local node does not match nodeSelector of layer2", "node", nodeName) - continue - } - } - - l2vnis = append(l2vnis, *item) - } + l2vnis = append(l2vnis, layer2List.Items...) 
- if err := r.checkL2Duplicates(l2vnis); err != nil { + if err := checkL2Duplicates(l2vnis); err != nil { return nil, err } return l2vnis, nil } -func (r *reconcile) reconcileLayer2(l2vnis []networkv1alpha1.Layer2NetworkConfiguration) error { +func (r *reconcileNodeNetworkConfig) reconcileLayer2(l2vnis []networkv1alpha1.Layer2NetworkConfigurationSpec) error { desired, err := r.getDesired(l2vnis) if err != nil { return err @@ -129,7 +80,7 @@ func (r *reconcile) reconcileLayer2(l2vnis []networkv1alpha1.Layer2NetworkConfig return nil } -func (r *reconcile) createL2(info *nl.Layer2Information, anycastTrackerInterfaces *[]int) error { +func (r *reconcileNodeNetworkConfig) createL2(info *nl.Layer2Information, anycastTrackerInterfaces *[]int) error { r.Logger.Info("Creating Layer2", "vlan", info.VlanID, "vni", info.VNI) err := r.netlinkManager.CreateL2(info) if err != nil { @@ -145,7 +96,7 @@ func (r *reconcile) createL2(info *nl.Layer2Information, anycastTrackerInterface return nil } -func (r *reconcile) getDesired(l2vnis []networkv1alpha1.Layer2NetworkConfiguration) ([]nl.Layer2Information, error) { +func (r *reconcileNodeNetworkConfig) getDesired(l2vnis []networkv1alpha1.Layer2NetworkConfigurationSpec) ([]nl.Layer2Information, error) { availableVrfs, err := r.netlinkManager.ListL3() if err != nil { return nil, fmt.Errorf("error loading available VRFs: %w", err) @@ -153,7 +104,7 @@ func (r *reconcile) getDesired(l2vnis []networkv1alpha1.Layer2NetworkConfigurati desired := []nl.Layer2Information{} for i := range l2vnis { - spec := l2vnis[i].Spec + spec := l2vnis[i] var anycastMAC *net.HardwareAddr if mac, err := net.ParseMAC(spec.AnycastMac); err == nil { @@ -162,7 +113,7 @@ func (r *reconcile) getDesired(l2vnis []networkv1alpha1.Layer2NetworkConfigurati anycastGateways, err := r.netlinkManager.ParseIPAddresses(spec.AnycastGateways) if err != nil { - r.Logger.Error(err, "error parsing anycast gateways", "layer", l2vnis[i].ObjectMeta.Name, "gw", spec.AnycastGateways) + 
r.Logger.Error(err, "error parsing anycast gateways", "gw", spec.AnycastGateways) return nil, fmt.Errorf("error parsing anycast gateways: %w", err) } @@ -175,7 +126,7 @@ func (r *reconcile) getDesired(l2vnis []networkv1alpha1.Layer2NetworkConfigurati } } if !vrfAvailable { - r.Logger.Error(err, "VRF of Layer2 not found on node", "layer", l2vnis[i].ObjectMeta.Name, "vrf", spec.VRF) + r.Logger.Error(err, "VRF of Layer2 not found on node", "vrf", spec.VRF) continue } } @@ -213,7 +164,7 @@ func determineToBeDeleted(existing, desired []nl.Layer2Information) []nl.Layer2I return toDelete } -func (r *reconcile) reconcileExistingLayer(desired, currentConfig *nl.Layer2Information, anycastTrackerInterfaces *[]int) error { +func (r *reconcileNodeNetworkConfig) reconcileExistingLayer(desired, currentConfig *nl.Layer2Information, anycastTrackerInterfaces *[]int) error { r.Logger.Info("Reconciling existing Layer2", "vlan", desired.VlanID, "vni", desired.VNI) err := r.netlinkManager.ReconcileL2(currentConfig, desired) if err != nil { @@ -229,7 +180,7 @@ func (r *reconcile) reconcileExistingLayer(desired, currentConfig *nl.Layer2Info return nil } -func (*reconcile) checkL2Duplicates(configs []networkv1alpha1.Layer2NetworkConfiguration) error { +func checkL2Duplicates(configs []networkv1alpha1.Layer2NetworkConfiguration) error { for i := range configs { for j := i + 1; j < len(configs); j++ { if configs[i].Spec.ID == configs[j].Spec.ID { diff --git a/pkg/reconciler/layer3.go b/pkg/reconciler/layer3.go index bf50b417..55ccc3f2 100644 --- a/pkg/reconciler/layer3.go +++ b/pkg/reconciler/layer3.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "net" + "os" "sort" "strconv" "time" @@ -11,12 +12,14 @@ import ( networkv1alpha1 "github.com/telekom/das-schiff-network-operator/api/v1alpha1" "github.com/telekom/das-schiff-network-operator/pkg/config" "github.com/telekom/das-schiff-network-operator/pkg/frr" + "github.com/telekom/das-schiff-network-operator/pkg/healthcheck" 
"github.com/telekom/das-schiff-network-operator/pkg/nl" + "k8s.io/apimachinery/pkg/types" ) const defaultSleep = 2 * time.Second -func (r *reconcile) fetchLayer3(ctx context.Context) ([]networkv1alpha1.VRFRouteConfiguration, error) { +func (r *reconcileConfig) fetchLayer3(ctx context.Context) ([]networkv1alpha1.VRFRouteConfiguration, error) { vrfs := &networkv1alpha1.VRFRouteConfigurationList{} err := r.client.List(ctx, vrfs) if err != nil { @@ -27,7 +30,7 @@ func (r *reconcile) fetchLayer3(ctx context.Context) ([]networkv1alpha1.VRFRoute return vrfs.Items, nil } -func (r *reconcile) fetchTaas(ctx context.Context) ([]networkv1alpha1.RoutingTable, error) { +func (r *reconcileConfig) fetchTaas(ctx context.Context) ([]networkv1alpha1.RoutingTable, error) { tables := &networkv1alpha1.RoutingTableList{} err := r.client.List(ctx, tables) if err != nil { @@ -38,8 +41,17 @@ func (r *reconcile) fetchTaas(ctx context.Context) ([]networkv1alpha1.RoutingTab return tables.Items, nil } +func (r *reconcileNodeNetworkConfig) fetchNodeConfig(ctx context.Context) (*networkv1alpha1.NodeNetworkConfig, error) { + cfg := &networkv1alpha1.NodeNetworkConfig{} + err := r.client.Get(ctx, types.NamespacedName{Name: os.Getenv(healthcheck.NodenameEnv)}, cfg) + if err != nil { + return nil, fmt.Errorf("error getting NodeConfig: %w", err) + } + return cfg, nil +} + // nolint: contextcheck // context is not relevant -func (r *reconcile) reconcileLayer3(l3vnis []networkv1alpha1.VRFRouteConfiguration, taas []networkv1alpha1.RoutingTable) error { +func (r *reconcileNodeNetworkConfig) reconcileLayer3(l3vnis []networkv1alpha1.VRFRouteConfigurationSpec, taas []networkv1alpha1.RoutingTableSpec) error { vrfConfigMap, err := r.createVrfConfigMap(l3vnis) if err != nil { return err @@ -96,7 +108,7 @@ func (r *reconcile) reconcileLayer3(l3vnis []networkv1alpha1.VRFRouteConfigurati return nil } -func (r *reconcile) configureFRR(vrfConfigs []frr.VRFConfiguration, reloadTwice bool) error { +func (r 
*reconcileNodeNetworkConfig) configureFRR(vrfConfigs []frr.VRFConfiguration, reloadTwice bool) error {
 	changed, err := r.frrManager.Configure(frr.Configuration{
 		VRFs: vrfConfigs,
 		ASN:  r.config.ServerASN,
@@ -127,7 +139,7 @@ func (r *reconcile) configureFRR(vrfConfigs []frr.VRFConfiguration, reloadTwice
 	return nil
 }
 
-func (r *reconcile) reloadFRR() error {
+func (r *reconcileNodeNetworkConfig) reloadFRR() error {
 	r.Logger.Info("trying to reload FRR config because it changed")
 	err := r.frrManager.ReloadFRR()
 	if err != nil {
@@ -143,11 +155,11 @@ func (r *reconcile) reloadFRR() error {
 	return nil
 }
 
-func (r *reconcile) createVrfConfigMap(l3vnis []networkv1alpha1.VRFRouteConfiguration) (map[string]frr.VRFConfiguration, error) {
+func (r *reconcileNodeNetworkConfig) createVrfConfigMap(l3vnis []networkv1alpha1.VRFRouteConfigurationSpec) (map[string]frr.VRFConfiguration, error) {
 	vrfConfigMap := map[string]frr.VRFConfiguration{}
 	for i := range l3vnis {
-		spec := l3vnis[i].Spec
-		logger := r.Logger.WithValues("name", l3vnis[i].ObjectMeta.Name, "namespace", l3vnis[i].ObjectMeta.Namespace, "vrf", spec.VRF)
+		spec := l3vnis[i]
+		logger := r.Logger.WithValues("vrf", spec.VRF)
 
 		var vni int
 		var rt string
@@ -163,13 +175,13 @@ func (r *reconcile) createVrfConfigMap(l3vnis []networkv1alpha1.VRFRouteConfigur
 			vni = config.SkipVrfTemplateVni
 		} else {
 			err := fmt.Errorf("vrf not in vrf vni map")
-			r.Logger.Error(err, "VRF does not exist in VRF VNI config, ignoring", "vrf", spec.VRF, "name", l3vnis[i].ObjectMeta.Name, "namespace", l3vnis[i].ObjectMeta.Namespace)
+			r.Logger.Error(err, "VRF does not exist in VRF VNI config, ignoring", "vrf", spec.VRF)
 			continue
 		}
 
-		if vni == 0 && vni > 16777215 {
+		if vni == 0 || vni > 16777215 {
 			err := fmt.Errorf("VNI can not be set to 0")
-			r.Logger.Error(err, "VNI can not be set to 0, ignoring", "vrf", spec.VRF, "name", l3vnis[i].ObjectMeta.Name, "namespace", l3vnis[i].ObjectMeta.Namespace)
+			r.Logger.Error(err, "VNI can not be set to 0, ignoring", "vrf", spec.VRF)
 			continue
 		}
 
@@
-183,11 +195,11 @@ func (r *reconcile) createVrfConfigMap(l3vnis []networkv1alpha1.VRFRouteConfigur return vrfConfigMap, nil } -func createVrfFromTaaS(taas []networkv1alpha1.RoutingTable) map[string]frr.VRFConfiguration { +func createVrfFromTaaS(taas []networkv1alpha1.RoutingTableSpec) map[string]frr.VRFConfiguration { vrfConfigMap := map[string]frr.VRFConfiguration{} for i := range taas { - spec := taas[i].Spec + spec := taas[i] name := fmt.Sprintf("taas.%d", spec.TableID) @@ -242,7 +254,7 @@ func createVrfConfig(vrfConfigMap map[string]frr.VRFConfiguration, spec *network return &cfg, nil } -func (r *reconcile) reconcileL3Netlink(vrfConfigs []frr.VRFConfiguration) ([]nl.VRFInformation, bool, error) { +func (r *reconcileNodeNetworkConfig) reconcileL3Netlink(vrfConfigs []frr.VRFConfiguration) ([]nl.VRFInformation, bool, error) { existing, err := r.netlinkManager.ListL3() if err != nil { return nil, false, fmt.Errorf("error listing L3 VRF information: %w", err) @@ -289,7 +301,7 @@ func (r *reconcile) reconcileL3Netlink(vrfConfigs []frr.VRFConfiguration) ([]nl. 
return toCreate, len(toDelete) > 0, nil } -func (r *reconcile) reconcileTaasNetlink(vrfConfigs []frr.VRFConfiguration) (bool, error) { +func (r *reconcileNodeNetworkConfig) reconcileTaasNetlink(vrfConfigs []frr.VRFConfiguration) (bool, error) { existing, err := r.netlinkManager.ListTaas() if err != nil { return false, fmt.Errorf("error listing TaaS VRF information: %w", err) @@ -308,7 +320,7 @@ func (r *reconcile) reconcileTaasNetlink(vrfConfigs []frr.VRFConfiguration) (boo return deletedInterface, nil } -func (r *reconcile) cleanupTaasNetlink(existing []nl.TaasInformation, intended []frr.VRFConfiguration) (bool, error) { +func (r *reconcileNodeNetworkConfig) cleanupTaasNetlink(existing []nl.TaasInformation, intended []frr.VRFConfiguration) (bool, error) { deletedInterface := false for _, cfg := range existing { stillExists := false @@ -328,7 +340,7 @@ func (r *reconcile) cleanupTaasNetlink(existing []nl.TaasInformation, intended [ return deletedInterface, nil } -func (r *reconcile) createTaasNetlink(existing []nl.TaasInformation, intended []frr.VRFConfiguration) error { +func (r *reconcileNodeNetworkConfig) createTaasNetlink(existing []nl.TaasInformation, intended []frr.VRFConfiguration) error { for i := range intended { alreadyExists := false for _, cfg := range existing { @@ -351,7 +363,7 @@ func (r *reconcile) createTaasNetlink(existing []nl.TaasInformation, intended [] return nil } -func (r *reconcile) reconcileExisting(cfg nl.VRFInformation) error { +func (r *reconcileNodeNetworkConfig) reconcileExisting(cfg nl.VRFInformation) error { if err := r.netlinkManager.EnsureBPFProgram(cfg); err != nil { return fmt.Errorf("error ensuring BPF program on VRF") } diff --git a/pkg/reconciler/nodenetworkconfig_reconciler.go b/pkg/reconciler/nodenetworkconfig_reconciler.go new file mode 100644 index 00000000..cd5f8f34 --- /dev/null +++ b/pkg/reconciler/nodenetworkconfig_reconciler.go @@ -0,0 +1,274 @@ +package reconciler + +import ( + "context" + "encoding/json" + "errors" 
+ "fmt" + "os" + "time" + + "github.com/go-logr/logr" + "github.com/telekom/das-schiff-network-operator/api/v1alpha1" + "github.com/telekom/das-schiff-network-operator/pkg/anycast" + "github.com/telekom/das-schiff-network-operator/pkg/config" + "github.com/telekom/das-schiff-network-operator/pkg/frr" + "github.com/telekom/das-schiff-network-operator/pkg/healthcheck" + "github.com/telekom/das-schiff-network-operator/pkg/nl" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + defaultDebounceTime = 1 * time.Second + + DefaultNodeConfigPath = "/opt/network-operator/current-config.yaml" + nodeConfigFilePerm = 0o600 +) + +type NodeNetworkConfigReconciler struct { + client client.Client + netlinkManager *nl.Manager + frrManager frr.ManagerInterface + anycastTracker *anycast.Tracker + config *config.Config + logger logr.Logger + healthChecker *healthcheck.HealthChecker + nodeConfig *v1alpha1.NodeNetworkConfig + nodeConfigPath string + dirtyFRRConfig bool +} + +type reconcileNodeNetworkConfig struct { + *NodeNetworkConfigReconciler + logr.Logger +} + +func NewNodeNetworkConfigReconciler(clusterClient client.Client, anycastTracker *anycast.Tracker, logger logr.Logger, nodeConfigPath string, frrManager frr.ManagerInterface, netlinkManager *nl.Manager) (*NodeNetworkConfigReconciler, error) { + reconciler := &NodeNetworkConfigReconciler{ + client: clusterClient, + netlinkManager: netlinkManager, + frrManager: frrManager, + anycastTracker: anycastTracker, + logger: logger, + nodeConfigPath: nodeConfigPath, + } + + cfg, err := config.LoadConfig() + if err != nil { + return nil, fmt.Errorf("error loading config: %w", err) + } + reconciler.config = cfg + + if val := os.Getenv("FRR_CONFIG_FILE"); val != "" { + reconciler.frrManager.SetConfigPath(val) + } + + if err := reconciler.frrManager.Init(cfg.SkipVRFConfig[0]); err != nil { + return nil, fmt.Errorf("error trying to init 
FRR Manager: %w", err) + } + + nc, err := healthcheck.LoadConfig(healthcheck.NetHealthcheckFile) + if err != nil { + return nil, fmt.Errorf("error loading networking healthcheck config: %w", err) + } + + tcpDialer := healthcheck.NewTCPDialer(nc.Timeout) + reconciler.healthChecker, err = healthcheck.NewHealthChecker(reconciler.client, + healthcheck.NewDefaultHealthcheckToolkit(reconciler.frrManager, tcpDialer), + nc) + if err != nil { + return nil, fmt.Errorf("error creating networking healthchecker: %w", err) + } + + reconciler.nodeConfig, err = readNodeConfig(reconciler.nodeConfigPath) + if !errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("error reading NodeConfig from disk: %w", err) + } + + return reconciler, nil +} + +func (reconciler *NodeNetworkConfigReconciler) Reconcile(ctx context.Context) error { + r := &reconcileNodeNetworkConfig{ + NodeNetworkConfigReconciler: reconciler, + Logger: reconciler.logger, + } + + if err := r.config.ReloadConfig(); err != nil { + return fmt.Errorf("error reloading network-operator config: %w", err) + } + + // get NodeConfig from apiserver + cfg, err := r.fetchNodeConfig(ctx) + if err != nil { + // discard IsNotFound error + if apierrors.IsNotFound(err) { + return nil + } + return err + } + + if r.nodeConfig != nil && r.nodeConfig.Spec.Revision == cfg.Spec.Revision { + // current in-memory conifg has the same revision as the fetched one + // this means that config was already provisioned - skip + if cfg.Status.ConfigStatus != StatusProvisioned { + if err := setStatus(ctx, r.client, cfg, StatusProvisioned); err != nil { + return fmt.Errorf("error setting config status: %w", err) + } + } + return nil + } + + // config is invalid - discard + if cfg.Status.ConfigStatus == StatusInvalid { + return nil + } + if err := r.processConfig(ctx, cfg); err != nil { + return fmt.Errorf("error while processing config: %w", err) + } + + // replace in-memory working config and store it on the disk + reconciler.nodeConfig = cfg + if 
err := storeNodeConfig(cfg, reconciler.nodeConfigPath); err != nil { + return fmt.Errorf("error saving NodeConfig status: %w", err) + } + + return nil +} + +func (r *reconcileNodeNetworkConfig) processConfig(ctx context.Context, cfg *v1alpha1.NodeNetworkConfig) error { + // set config status as provisioning + if err := setStatus(ctx, r.client, cfg, StatusProvisioning); err != nil { + return fmt.Errorf("error setting config status %s: %w", StatusProvisioning, err) + } + + // reconcile config + if err := doReconciliation(r, cfg); err != nil { + // if reconciliation failed set NodeConfig's status as invalid and restore last known working config + if err := r.invalidateAndRestore(ctx, cfg); err != nil { + return fmt.Errorf("reconciler restoring config: %w", err) + } + + return fmt.Errorf("reconciler error: %w", err) + } + + // check if node is healthly after reconciliation + if err := r.checkHealth(ctx); err != nil { + // if node is not healthly set NodeConfig's status as invalid and restore last known working config + if err := r.invalidateAndRestore(ctx, cfg); err != nil { + return fmt.Errorf("reconciler restoring config: %w", err) + } + + return fmt.Errorf("healthcheck error (previous config restored): %w", err) + } + + // set config status as provisioned (valid) + if err := setStatus(ctx, r.client, cfg, StatusProvisioned); err != nil { + return fmt.Errorf("error setting config status %s: %w", StatusProvisioned, err) + } + + return nil +} + +func setStatus(ctx context.Context, c client.Client, cfg *v1alpha1.NodeNetworkConfig, status string) error { + cfg.Status.ConfigStatus = status + cfg.Status.LastUpdate = metav1.Now() + if err := c.Status().Update(ctx, cfg); err != nil { + return fmt.Errorf("error updating NodeConfig status: %w", err) + } + return nil +} + +func (r *reconcileNodeNetworkConfig) invalidateAndRestore(ctx context.Context, cfg *v1alpha1.NodeNetworkConfig) error { + if err := setStatus(ctx, r.client, cfg, StatusInvalid); err != nil { + return 
fmt.Errorf("error invalidating config: %w", err) + } + + // try to restore previously known good NodeConfig + if err := r.restoreNodeConfig(); err != nil { + return fmt.Errorf("error restoring NodeConfig: %w", err) + } + + return nil +} + +func doReconciliation(r *reconcileNodeNetworkConfig, nodeCfg *v1alpha1.NodeNetworkConfig) error { + r.logger.Info("config to reconcile", "NodeConfig", *nodeCfg) + l3vnis := nodeCfg.Spec.Vrf + l2vnis := nodeCfg.Spec.Layer2 + taas := nodeCfg.Spec.RoutingTable + + if err := r.reconcileLayer3(l3vnis, taas); err != nil { + return err + } + if err := r.reconcileLayer2(l2vnis); err != nil { + return err + } + + return nil +} + +func (r *reconcileNodeNetworkConfig) restoreNodeConfig() error { + if r.nodeConfig == nil { + return nil + } + if err := doReconciliation(r, r.nodeConfig); err != nil { + return fmt.Errorf("error restoring configuration: %w", err) + } + + r.logger.Info("restored last known valid config") + + return nil +} + +func readNodeConfig(path string) (*v1alpha1.NodeNetworkConfig, error) { + cfg, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("error reading NodeConfig: %w", err) + } + + nodeConfig := &v1alpha1.NodeNetworkConfig{} + if err := json.Unmarshal(cfg, nodeConfig); err != nil { + return nil, fmt.Errorf("error unmarshalling NodeConfig: %w", err) + } + + return nodeConfig, nil +} + +func storeNodeConfig(cfg *v1alpha1.NodeNetworkConfig, path string) error { + // save working config + c, err := json.MarshalIndent(*cfg, "", " ") + if err != nil { + panic(err) + } + + if err = os.WriteFile(path, c, nodeConfigFilePerm); err != nil { + return fmt.Errorf("error saving NodeConfig status: %w", err) + } + + return nil +} + +func (reconciler *NodeNetworkConfigReconciler) checkHealth(ctx context.Context) error { + _, err := reconciler.healthChecker.IsFRRActive() + if err != nil { + return fmt.Errorf("error checking FRR status: %w", err) + } + if err := reconciler.healthChecker.CheckInterfaces(); err != nil { 
+ return fmt.Errorf("error checking network interfaces: %w", err) + } + if err := reconciler.healthChecker.CheckReachability(); err != nil { + return fmt.Errorf("error checking network reachability: %w", err) + } + if err := reconciler.healthChecker.CheckAPIServer(ctx); err != nil { + return fmt.Errorf("error checking API Server reachability: %w", err) + } + if !reconciler.healthChecker.TaintsRemoved() { + if err := reconciler.healthChecker.RemoveTaints(ctx); err != nil { + return fmt.Errorf("error removing taint from the node: %w", err) + } + } + return nil +} diff --git a/pkg/reconciler/reconciler.go b/pkg/reconciler/reconciler.go deleted file mode 100644 index 9f7dd945..00000000 --- a/pkg/reconciler/reconciler.go +++ /dev/null @@ -1,132 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "os" - "time" - - "github.com/go-logr/logr" - "github.com/telekom/das-schiff-network-operator/pkg/anycast" - "github.com/telekom/das-schiff-network-operator/pkg/config" - "github.com/telekom/das-schiff-network-operator/pkg/debounce" - "github.com/telekom/das-schiff-network-operator/pkg/frr" - "github.com/telekom/das-schiff-network-operator/pkg/healthcheck" - "github.com/telekom/das-schiff-network-operator/pkg/nl" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -const defaultDebounceTime = 20 * time.Second - -type Reconciler struct { - client client.Client - netlinkManager *nl.Manager - frrManager *frr.Manager - anycastTracker *anycast.Tracker - config *config.Config - logger logr.Logger - healthChecker *healthcheck.HealthChecker - - debouncer *debounce.Debouncer - - dirtyFRRConfig bool -} - -type reconcile struct { - *Reconciler - logr.Logger -} - -func NewReconciler(clusterClient client.Client, anycastTracker *anycast.Tracker, logger logr.Logger) (*Reconciler, error) { - reconciler := &Reconciler{ - client: clusterClient, - netlinkManager: nl.NewManager(&nl.Toolkit{}), - frrManager: frr.NewFRRManager(), - anycastTracker: anycastTracker, - logger: logger, - } - - 
reconciler.debouncer = debounce.NewDebouncer(reconciler.reconcileDebounced, defaultDebounceTime, logger) - - cfg, err := config.LoadConfig() - if err != nil { - return nil, fmt.Errorf("error loading config: %w", err) - } - reconciler.config = cfg - - if val := os.Getenv("FRR_CONFIG_FILE"); val != "" { - reconciler.frrManager.ConfigPath = val - } - if err := reconciler.frrManager.Init(cfg.SkipVRFConfig[0]); err != nil { - return nil, fmt.Errorf("error trying to init FRR Manager: %w", err) - } - - nc, err := healthcheck.LoadConfig(healthcheck.NetHealthcheckFile) - if err != nil { - return nil, fmt.Errorf("error loading networking healthcheck config: %w", err) - } - - tcpDialer := healthcheck.NewTCPDialer(nc.Timeout) - reconciler.healthChecker, err = healthcheck.NewHealthChecker(reconciler.client, - healthcheck.NewDefaultHealthcheckToolkit(reconciler.frrManager, tcpDialer), - nc) - if err != nil { - return nil, fmt.Errorf("error creating netwokring healthchecker: %w", err) - } - - return reconciler, nil -} - -func (reconciler *Reconciler) Reconcile(ctx context.Context) { - reconciler.debouncer.Debounce(ctx) -} - -func (reconciler *Reconciler) reconcileDebounced(ctx context.Context) error { - r := &reconcile{ - Reconciler: reconciler, - Logger: reconciler.logger, - } - - r.Logger.Info("Reloading config") - if err := r.config.ReloadConfig(); err != nil { - return fmt.Errorf("error reloading network-operator config: %w", err) - } - - l3vnis, err := r.fetchLayer3(ctx) - if err != nil { - return err - } - l2vnis, err := r.fetchLayer2(ctx) - if err != nil { - return err - } - taas, err := r.fetchTaas(ctx) - if err != nil { - return err - } - - if err := r.reconcileLayer3(l3vnis, taas); err != nil { - return err - } - if err := r.reconcileLayer2(l2vnis); err != nil { - return err - } - - if !reconciler.healthChecker.IsNetworkingHealthy() { - _, err := reconciler.healthChecker.IsFRRActive() - if err != nil { - return fmt.Errorf("error checking FRR status: %w", err) - } - if 
err = reconciler.healthChecker.CheckInterfaces(); err != nil { - return fmt.Errorf("error checking network interfaces: %w", err) - } - if err = reconciler.healthChecker.CheckReachability(); err != nil { - return fmt.Errorf("error checking network reachability: %w", err) - } - if err = reconciler.healthChecker.RemoveTaints(ctx); err != nil { - return fmt.Errorf("error removing taint from the node: %w", err) - } - } - - return nil -} diff --git a/pkg/reconciler/reconciler_test.go b/pkg/reconciler/reconciler_test.go new file mode 100644 index 00000000..d66adf73 --- /dev/null +++ b/pkg/reconciler/reconciler_test.go @@ -0,0 +1,392 @@ +package reconciler + +import ( + "context" + "encoding/json" + "fmt" + "os" + "testing" + "time" + + "github.com/go-logr/logr" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + "github.com/telekom/das-schiff-network-operator/api/v1alpha1" + "github.com/telekom/das-schiff-network-operator/pkg/config" + mock_frr "github.com/telekom/das-schiff-network-operator/pkg/frr/mock" + "github.com/telekom/das-schiff-network-operator/pkg/healthcheck" + "github.com/telekom/das-schiff-network-operator/pkg/nl" + mock_nl "github.com/telekom/das-schiff-network-operator/pkg/nl/mock" + "github.com/vishvananda/netlink" + "go.uber.org/mock/gomock" + "gopkg.in/yaml.v2" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +var ( + fakeNCRJSON = `{ + "apiVersion": "v1", + "items": [ + { + "apiVersion": "network.schiff.telekom.de/v1alpha1", + "kind": "NetworkConfigRevision", + "metadata": { + "creationTimestamp": "2024-07-11T15:16:00Z", + "generation": 1, + "name": "19dad916c7", + "resourceVersion": "91836", + "uid": "797e11da-1d60-4263-b2ad-fe0a73d761b7" + }, + "spec": { + "config": { + "layer2": [ + { + "id": 1, + "mtu": 1500, + "nodeSelector": { + "matchLabels": { + "worker": "true" + } + }, + "vni": 1 + } + ], + "revision": "", + 
"routingTable": [], + "vrf": [] + }, + "revision": "19dad916c701bc0aeebd14f66bae591f402cabd31cd9b150b87bca710abe3b33" + }, + "status": { + "isInvalid": false + } + } + ], + "kind": "List", + "metadata": { + "resourceVersion": "" + } + }` + + fakeNodesJSON = `{"items":[ + { + "apiVersion": "v1", + "kind": "Node", + "metadata": { + "name": "kind-worker" + }, + "status": { + "conditions": [ + { + "status": "True", + "type": "Ready" + } + ] + } + } + ]}` + + fakeNNCJSON = ` + { + "apiVersion": "v1", + "items": [ + { + "apiVersion": "network.schiff.telekom.de/v1alpha1", + "kind": "NodeNetworkConfig", + "metadata": { + "creationTimestamp": "2024-07-11T15:14:32Z", + "generation": 4, + "name": "test-node", + "ownerReferences": [ + { + "apiVersion": "v1", + "kind": "Node", + "name": "test-node", + "uid": "a616532b-e188-41d7-a0f3-6f17cdfa50b8" + } + ], + "resourceVersion": "97276", + "uid": "b80f17a1-d68e-4e6d-b0cb-e2fdc97b0363" + }, + "spec": { + "layer2": [ + { + "id": 1, + "mtu": 1500, + "vni": 1 + } + ], + "revision": "19dad916c701bc0aeebd14f66bae591f402cabd31cd9b150b87bca710abe3b33", + "routingTable": [], + "vrf": [] + }, + "status": { + "configStatus": "provisioned" + } + } + ], + "kind": "List", + "metadata": { + "resourceVersion": "" + } + } +` + + mockctrl *gomock.Controller + tmpDir string + testConfig string +) + +const ( + operatorConfigEnv = "OPERATOR_CONFIG" + dummy = "dummy" +) + +var _ = BeforeSuite(func() { + var err error + tmpDir, err = os.MkdirTemp(".", "testdata") + Expect(err).ToNot(HaveOccurred()) + testConfig = tmpDir + "/config.yaml" + + config := config.Config{ + SkipVRFConfig: []string{dummy}, + } + + configData, err := yaml.Marshal(config) + Expect(err).ToNot(HaveOccurred()) + + err = os.WriteFile(testConfig, configData, 0o600) + Expect(err).ToNot(HaveOccurred()) + err = os.Setenv(operatorConfigEnv, testConfig) + Expect(err).ToNot(HaveOccurred()) + err = os.Setenv(healthcheck.NodenameEnv, "test-node") + Expect(err).ToNot(HaveOccurred()) +}) + +var 
_ = AfterSuite(func() { + err := os.RemoveAll(tmpDir) + Expect(err).ToNot(HaveOccurred()) + err = os.Unsetenv(operatorConfigEnv) + Expect(err).ToNot(HaveOccurred()) + err = os.Unsetenv(healthcheck.NodenameEnv) + Expect(err).ToNot(HaveOccurred()) +}) + +func TestReconciler(t *testing.T) { + RegisterFailHandler(Fail) + mockctrl = gomock.NewController(t) + defer mockctrl.Finish() + RunSpecs(t, + "Reconciler Suite") +} + +var _ = Describe("ConfigReconciler", func() { + Context("NewConfigReconciler() should", func() { + It("return new config reconciler", func() { + c := createFullClient() + r, err := NewConfigReconciler(c, logr.New(nil), time.Millisecond*100) + Expect(r).ToNot(BeNil()) + Expect(err).ToNot(HaveOccurred()) + }) + }) + Context("ReconcileDebounced() should", func() { + It("return no error", func() { + c := createFullClient() + r, err := NewConfigReconciler(c, logr.New(nil), time.Millisecond*100) + Expect(r).ToNot(BeNil()) + Expect(err).ToNot(HaveOccurred()) + err = r.ReconcileDebounced(context.TODO()) + Expect(err).ToNot(HaveOccurred()) + }) + }) +}) + +var _ = Describe("NodeConfigReconciler", func() { + Context("NewNodeConfigReconciler() should", func() { + It("return new node config reconciler", func() { + c := createClient() + r, err := NewNodeConfigReconciler(c, logr.New(nil), time.Millisecond*100, runtime.NewScheme(), 1) + Expect(r).ToNot(BeNil()) + Expect(err).ToNot(HaveOccurred()) + }) + }) + Context("reconcileDebaunced() should", func() { + It("return no error if there is nothing to deploy", func() { + c := createClient() + r, err := NewNodeConfigReconciler(c, logr.New(nil), time.Millisecond*100, runtime.NewScheme(), 1) + Expect(r).ToNot(BeNil()) + Expect(err).ToNot(HaveOccurred()) + err = r.reconcileDebounced(context.TODO()) + Expect(err).ToNot(HaveOccurred()) + }) + It("return error if cannot set revision isInvalid status to false", func() { + fakeNCR := &v1alpha1.NetworkConfigRevisionList{} + err := json.Unmarshal([]byte(fakeNCRJSON), fakeNCR) + 
Expect(err).ShouldNot(HaveOccurred()) + c := createClient(fakeNCR) + r, err := NewNodeConfigReconciler(c, logr.New(nil), time.Millisecond*100, runtime.NewScheme(), 1) + Expect(r).ToNot(BeNil()) + Expect(err).ToNot(HaveOccurred()) + err = r.reconcileDebounced(context.TODO()) + Expect(err).To(HaveOccurred()) + }) + It("no error if NodeConfigRevision deployed successfully", func() { + c := createFullClient() + r, err := NewNodeConfigReconciler(c, logr.New(nil), time.Millisecond*100, runtime.NewScheme(), 1) + Expect(r).ToNot(BeNil()) + Expect(err).ToNot(HaveOccurred()) + err = r.reconcileDebounced(context.TODO()) + Expect(err).ToNot(HaveOccurred()) + }) + It("return error on context timeout", func() { + fakeNCR := &v1alpha1.NetworkConfigRevisionList{} + err := json.Unmarshal([]byte(fakeNCRJSON), fakeNCR) + Expect(err).ShouldNot(HaveOccurred()) + fakeNodes := &corev1.NodeList{} + err = json.Unmarshal([]byte(fakeNodesJSON), fakeNodes) + Expect(err).ToNot(HaveOccurred()) + fakeNNC := &v1alpha1.NodeNetworkConfigList{} + err = json.Unmarshal([]byte(fakeNNCJSON), fakeNNC) + Expect(err).ShouldNot(HaveOccurred()) + + c := createClientWithStatus(&fakeNCR.Items[0], &fakeNNC.Items[0], fakeNCR, fakeNNC, fakeNodes) + r, err := NewNodeConfigReconciler(c, logr.New(nil), time.Millisecond*100, runtime.NewScheme(), 1) + Expect(r).ToNot(BeNil()) + Expect(err).ToNot(HaveOccurred()) + err = r.reconcileDebounced(context.TODO()) + Expect(err).To(HaveOccurred()) + }) + }) +}) + +var _ = Describe("NodeNetworkConfigReconciler", func() { + Context("NewNodeNetworkConfigReconciler() should", func() { + It("return error if cannot init FRR Manager", func() { + frrManagerMock := mock_frr.NewMockManagerInterface(mockctrl) + c := createClient() + frrManagerMock.EXPECT().Init(gomock.Any()).Return(fmt.Errorf("init error")) + r, err := NewNodeNetworkConfigReconciler(c, nil, logr.New(nil), "", + frrManagerMock, nl.NewManager(mock_nl.NewMockToolkitInterface(mockctrl))) + Expect(err).To(HaveOccurred()) + 
Expect(r).To(BeNil()) + }) + It("create new reconciler", func() { + frrManagerMock := mock_frr.NewMockManagerInterface(mockctrl) + c := createClient() + frrManagerMock.EXPECT().Init(gomock.Any()).Return(nil) + r, err := NewNodeNetworkConfigReconciler(c, nil, logr.New(nil), "", + frrManagerMock, nl.NewManager(mock_nl.NewMockToolkitInterface(mockctrl))) + Expect(err).ToNot(HaveOccurred()) + Expect(r).ToNot(BeNil()) + }) + }) + Context("Reconcile() should", func() { + It("return no error if there is no config to reconcile", func() { + frrManagerMock := mock_frr.NewMockManagerInterface(mockctrl) + c := createClient() + frrManagerMock.EXPECT().Init(gomock.Any()).Return(nil) + r, err := NewNodeNetworkConfigReconciler(c, nil, logr.New(nil), "", + frrManagerMock, nl.NewManager(mock_nl.NewMockToolkitInterface(mockctrl))) + Expect(err).ToNot(HaveOccurred()) + Expect(r).ToNot(BeNil()) + err = r.Reconcile(context.TODO()) + Expect(err).ToNot(HaveOccurred()) + }) + It("return no error if there is no config to reconcile", func() { + frrManagerMock := mock_frr.NewMockManagerInterface(mockctrl) + c := createClient() + frrManagerMock.EXPECT().Init(gomock.Any()).Return(nil) + r, err := NewNodeNetworkConfigReconciler(c, nil, logr.New(nil), "", + frrManagerMock, nl.NewManager(mock_nl.NewMockToolkitInterface(mockctrl))) + Expect(err).ToNot(HaveOccurred()) + Expect(r).ToNot(BeNil()) + err = r.Reconcile(context.TODO()) + Expect(err).ToNot(HaveOccurred()) + }) + It("return error if cannot configure FRR", func() { + frrManagerMock := mock_frr.NewMockManagerInterface(mockctrl) + netlinkMock := mock_nl.NewMockToolkitInterface(mockctrl) + netlinkMock.EXPECT().LinkList().Return([]netlink.Link{}, nil) + + c := createFullClient() + + frrManagerMock.EXPECT().Init(gomock.Any()).Return(nil) + frrManagerMock.EXPECT().Configure(gomock.Any(), + gomock.Any()).Return(false, fmt.Errorf("configuration error")) + r, err := NewNodeNetworkConfigReconciler(c, nil, logr.New(nil), "", + frrManagerMock, 
nl.NewManager(netlinkMock)) + Expect(err).ToNot(HaveOccurred()) + Expect(r).ToNot(BeNil()) + err = r.Reconcile(context.TODO()) + Expect(err).To(HaveOccurred()) + }) + It("return error if failed to reload FRR", func() { + frrManagerMock := mock_frr.NewMockManagerInterface(mockctrl) + netlinkMock := mock_nl.NewMockToolkitInterface(mockctrl) + netlinkMock.EXPECT().LinkList().Return([]netlink.Link{}, nil) + + c := createFullClient() + frrManagerMock.EXPECT().Init(gomock.Any()).Return(nil) + frrManagerMock.EXPECT().Configure(gomock.Any(), gomock.Any()).Return(true, nil) + frrManagerMock.EXPECT().ReloadFRR().Return(fmt.Errorf("error reloading FRR")) + frrManagerMock.EXPECT().RestartFRR().Return(fmt.Errorf("error restarting FRR")) + r, err := NewNodeNetworkConfigReconciler(c, nil, logr.New(nil), "", + frrManagerMock, nl.NewManager(netlinkMock)) + Expect(err).ToNot(HaveOccurred()) + Expect(r).ToNot(BeNil()) + err = r.Reconcile(context.TODO()) + Expect(err).To(HaveOccurred()) + }) + It("return error if cannot configure networking", func() { + frrManagerMock := mock_frr.NewMockManagerInterface(mockctrl) + netlinkMock := mock_nl.NewMockToolkitInterface(mockctrl) + netlinkMock.EXPECT().LinkList().Return([]netlink.Link{}, nil).Times(3) + netlinkMock.EXPECT().LinkAdd(gomock.Any()).Return(fmt.Errorf("link add error")) + + c := createFullClient() + frrManagerMock.EXPECT().Init(gomock.Any()).Return(nil) + frrManagerMock.EXPECT().Configure(gomock.Any(), gomock.Any()).Return(true, nil) + frrManagerMock.EXPECT().ReloadFRR().Return(nil) + r, err := NewNodeNetworkConfigReconciler(c, nil, logr.New(nil), "", + frrManagerMock, nl.NewManager(netlinkMock)) + Expect(err).ToNot(HaveOccurred()) + Expect(r).ToNot(BeNil()) + err = r.Reconcile(context.TODO()) + Expect(err).To(HaveOccurred()) + }) + }) +}) + +func createClient(initObjs ...runtime.Object) client.Client { + cb := clientBuilder(initObjs...) 
+ return cb.Build() +} + +func createClientWithStatus(ncr, nnc client.Object, initObjs ...runtime.Object) client.Client { + return clientBuilder(initObjs...).WithStatusSubresource(nnc, ncr).Build() +} + +func clientBuilder(initObjs ...runtime.Object) *fake.ClientBuilder { + s := runtime.NewScheme() + err := corev1.AddToScheme(s) + Expect(err).ToNot(HaveOccurred()) + err = v1alpha1.AddToScheme(s) + Expect(err).ToNot(HaveOccurred()) + return fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects(initObjs...) +} + +func createFullClient() client.Client { + fakeNNC := &v1alpha1.NodeNetworkConfigList{} + err := json.Unmarshal([]byte(fakeNNCJSON), fakeNNC) + Expect(err).ShouldNot(HaveOccurred()) + + fakeNCR := &v1alpha1.NetworkConfigRevisionList{} + err = json.Unmarshal([]byte(fakeNCRJSON), fakeNCR) + Expect(err).ShouldNot(HaveOccurred()) + c := clientBuilder(fakeNNC, fakeNCR).WithStatusSubresource(&fakeNNC.Items[0], &fakeNCR.Items[0]).Build() + + return c +}