Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 357: Integrating health checks #569

Merged
merged 11 commits into from
Aug 17, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/apis/pravega/v1beta1/pravega.go
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ func (s *PravegaSpec) withDefaults() (changed bool) {
changed = true
s.SegmentStorePodLabels = map[string]string{}
}

if s.ControllerSvcNameSuffix == "" {
changed = true
s.ControllerSvcNameSuffix = "pravega-controller"
Expand Down
10 changes: 5 additions & 5 deletions pkg/apis/pravega/v1beta1/pravegacluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ type ClusterSpec struct {
// The version must follow the [semver]( http://semver.org) format, for example "3.2.13".
// Only Pravega released versions are supported: https://github.com/pravega/pravega/releases
//
// If version is not set, default is "0.4.0".
// If version is not set, default value will be set.
// +optional
Version string `json:"version"`

Expand Down Expand Up @@ -207,7 +207,7 @@ func (s *ClusterSpec) withDefaults(p *PravegaCluster) (changed bool) {
s.Pravega.SegmentStorePodAffinity = util.PodAntiAffinity("pravega-segmentstore", p.GetName())
}

if util.IsVersionBelow07(s.Version) && s.Pravega.CacheVolumeClaimTemplate == nil {
if util.IsVersionBelow(s.Version, "0.7.0") && s.Pravega.CacheVolumeClaimTemplate == nil {
changed = true
s.Pravega.CacheVolumeClaimTemplate = &corev1.PersistentVolumeClaimSpec{
AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
Expand Down Expand Up @@ -454,7 +454,7 @@ func (dst *PravegaCluster) updatePravegaOwnerReferences() error {
}

func (dst *PravegaCluster) updateSSSReferences(ownerRefs []metav1.OwnerReference) error {
if util.IsVersionBelow07(dst.Spec.Version) {
if util.IsVersionBelow(dst.Spec.Version, "0.7.0") {
numPvcs := int(dst.Spec.Pravega.SegmentStoreReplicas)
for i := 0; i < numPvcs; i++ {
pvcName := "cache-" + dst.StatefulSetNameForSegmentstoreBelow07() + "-" + strconv.Itoa(i)
Expand Down Expand Up @@ -1196,7 +1196,7 @@ func (p *PravegaCluster) validateConfigMap() error {

//to return name of segmentstore based on the version
func (p *PravegaCluster) StatefulSetNameForSegmentstore() string {
if util.IsVersionBelow07(p.Spec.Version) {
if util.IsVersionBelow(p.Spec.Version, "0.7.0") {
return p.StatefulSetNameForSegmentstoreBelow07()
}
return p.StatefulSetNameForSegmentstoreAbove07()
Expand Down Expand Up @@ -1278,7 +1278,7 @@ func (p *PravegaCluster) ServiceNameForController() string {
}

func (p *PravegaCluster) ServiceNameForSegmentStore(index int32) string {
if util.IsVersionBelow07(p.Spec.Version) {
if util.IsVersionBelow(p.Spec.Version, "0.7.0") {
return p.ServiceNameForSegmentStoreBelow07(index)
}
return p.ServiceNameForSegmentStoreAbove07(index)
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/pravega/pravega_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ func makeControllerPodSpec(p *api.PravegaCluster) *corev1.PodSpec {
ReadinessProbe: &corev1.Probe{
Handler: corev1.Handler{
Exec: &corev1.ExecAction{
Command: util.ControllerReadinessCheck(10080, p.Spec.Authentication.IsEnabled()),
Command: util.ControllerReadinessCheck(p.Spec.Version, 10080, p.Spec.Authentication.IsEnabled()),
},
},
// Controller pods start fast. We give it up to 20 seconds to become ready.
Expand All @@ -181,7 +181,7 @@ func makeControllerPodSpec(p *api.PravegaCluster) *corev1.PodSpec {
LivenessProbe: &corev1.Probe{
Handler: corev1.Handler{
Exec: &corev1.ExecAction{
Command: util.HealthcheckCommand(9090),
Command: util.HealthcheckCommand(p.Spec.Version, 9090, 10080),
},
},
// We start the liveness probe from the maximum time the pod can take
Expand Down
8 changes: 4 additions & 4 deletions pkg/controller/pravega/pravega_segmentstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func MakeSegmentStoreStatefulSet(p *api.PravegaCluster) *appsv1.StatefulSet {
},
},
}
if util.IsVersionBelow07(p.Spec.Version) {
if util.IsVersionBelow(p.Spec.Version, "0.7.0") {
statefulSet.Spec.VolumeClaimTemplates = makeCacheVolumeClaimTemplate(p)
}
return statefulSet
Expand Down Expand Up @@ -179,7 +179,7 @@ func makeSegmentstorePodSpec(p *api.PravegaCluster) corev1.PodSpec {
volumeMounts = append(volumeMounts, m)
}
}
if util.IsVersionBelow07(p.Spec.Version) {
if util.IsVersionBelow(p.Spec.Version, "0.7.0") {
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Name: cacheVolumeName,
MountPath: cacheVolumeMountPoint,
Expand Down Expand Up @@ -209,7 +209,7 @@ func makeSegmentstorePodSpec(p *api.PravegaCluster) corev1.PodSpec {
ReadinessProbe: &corev1.Probe{
Handler: corev1.Handler{
Exec: &corev1.ExecAction{
Command: util.HealthcheckCommand(int32(containerport)),
Command: util.SegmentStoreReadinessCheck(p.Spec.Version, int32(containerport), 6061),
},
},
// Segment Stores can take a few minutes to become ready when the cluster
Expand All @@ -222,7 +222,7 @@ func makeSegmentstorePodSpec(p *api.PravegaCluster) corev1.PodSpec {
LivenessProbe: &corev1.Probe{
Handler: corev1.Handler{
Exec: &corev1.ExecAction{
Command: util.HealthcheckCommand(int32(containerport)),
Command: util.HealthcheckCommand(p.Spec.Version, int32(containerport), 6061),
},
},
// In the readiness probe we allow the pod to take up to 5 minutes
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/pravegacluster/pravegacluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ func (r *ReconcilePravegaCluster) deployCluster(p *pravegav1beta1.PravegaCluster
return err
}

if !util.IsVersionBelow07(p.Spec.Version) {
if !util.IsVersionBelow(p.Spec.Version, "0.7.0") {
newsts := &appsv1.StatefulSet{}
name := p.StatefulSetNameForSegmentstoreAbove07()
err = r.client.Get(context.TODO(),
Expand Down Expand Up @@ -1026,7 +1026,7 @@ func (r *ReconcilePravegaCluster) isRollbackTriggered(p *pravegav1beta1.PravegaC

//this function will return true only in case of upgrading from a version below 0.7 to pravega version 0.7 or later
func (r *ReconcilePravegaCluster) IsClusterUpgradingTo07(p *pravegav1beta1.PravegaCluster) bool {
if !util.IsVersionBelow07(p.Spec.Version) && util.IsVersionBelow07(p.Status.CurrentVersion) {
if !util.IsVersionBelow(p.Spec.Version, "0.7.0") && util.IsVersionBelow(p.Status.CurrentVersion, "0.7.0") {
return true
}
return false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,6 @@ var _ = Describe("PravegaCluster Controller", func() {
Ω(foundPravega.Spec.ExternalAccess.Enabled).Should(Equal(false))
Ω(foundPravega.Spec.ExternalAccess.DomainName).Should(Equal(""))
Ω(foundPravega.Spec.Pravega).ShouldNot(BeNil())
fmt.Println("DEFAULTS ARE SET")
})
})

Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/pravegacluster/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1beta1.Prav

//this function is to check are we doing a rollback in case of a upgrade failure while upgrading from a version below 07 to a version above 07
func (r *ReconcilePravegaCluster) IsClusterRollbackingFrom07(p *pravegav1beta1.PravegaCluster) bool {
if util.IsVersionBelow07(p.Spec.Version) && r.IsAbove07STSPresent(p) {
if util.IsVersionBelow(p.Spec.Version, "0.7.0") && r.IsAbove07STSPresent(p) {
return true
}
return false
Expand Down Expand Up @@ -452,7 +452,7 @@ func (r *ReconcilePravegaCluster) deleteExternalServices(p *pravegav1beta1.Prave
var name string = ""
for i := int32(0); i < p.Spec.Pravega.SegmentStoreReplicas; i++ {
service := &corev1.Service{}
if !util.IsVersionBelow07(p.Spec.Version) {
if !util.IsVersionBelow(p.Spec.Version, "0.7.0") {
name = p.ServiceNameForSegmentStoreBelow07(i)
} else {
name = p.ServiceNameForSegmentStoreAbove07(i)
Expand Down
77 changes: 52 additions & 25 deletions pkg/util/pravegacluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,23 @@ var (
)

const (
compareVersion string = "0.10.0"
MajorMinorVersionRegexp string = `^v?(?P<Version>[0-9]+\.[0-9]+\.[0-9]+)`
)

func init() {
versionRegexp = regexp.MustCompile(MajorMinorVersionRegexp)
}

//function to check if the version is below 0.7 or not
func IsVersionBelow07(ver string) bool {
if ver == "" {
//function to check if v1 is below v2 or not
func IsVersionBelow(v1 string, v2 string) bool {
if v1 == "" {
return true
}
result, _ := CompareVersions(ver, "0.7.0", "<")
if v2 == "" {
return false
}
result, _ := CompareVersions(v1, v2, "<")
if result {
return true
}
Expand Down Expand Up @@ -69,29 +73,52 @@ func IsOrphan(k8sObjectName string, replicas int32) bool {
return int32(ordinal) >= replicas
}

func HealthcheckCommand(port int32) []string {
return []string{"/bin/sh", "-c", fmt.Sprintf("netstat -ltn 2> /dev/null | grep %d || ss -ltn 2> /dev/null | grep %d", port, port)}
func HealthcheckCommand(version string, port int32, restport int32) []string {
command := ""
if IsVersionBelow(version, compareVersion) {
command = fmt.Sprintf("netstat -ltn 2> /dev/null | grep %d || ss -ltn 2> /dev/null | grep %d", port, port)
} else {
command = fmt.Sprintf("(netstat -ltn 2> /dev/null | grep %d || ss -ltn 2> /dev/null | grep %d) && (curl -s -X GET 'http://localhost:%d/v1/health/liveness' || curl -s -k -X GET 'https://localhost:%d/v1/health/liveness')", port, port, restport, restport)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it really required to check the port using netstat? Why not to send a REST request right away and check for return code?

It might happen that netstat will not be present in pravega containers starting from 0.10...

}
return []string{"/bin/sh", "-c", command}
}

func ControllerReadinessCheck(version string, port int32, authflag bool) []string {
command := ""
if IsVersionBelow(version, compareVersion) {
//This function check for the readiness of the controller in the following cases
//1) Auth and TLS Enabled- in this case, we check if the controller is properly enabled with authentication or not and we do a get on controller and with dummy credentials(testtls:testtls) and the controller returns 401 error in this case if it's correctly configured
//2) Auth Enabled and TLS Disabled- in this case, we check if the controller is properly enabled with authentication or not and we do a get on controller and with dummy credentials(testtls:testtls) and the controller returns 401 error in this case if it's correctly configured
//3) Auth Disabled and TLS Enabled- in this case, we check if the controller can create scopes or not by checking if _system scope is present or not
//4) Auth and TLS Disabled- in this case, we check if the controller can create scopes or not by checking if _system scope is present or not
if authflag == true {
// This is to check the readiness of controller in case auth is Enabled
// here we are using login credential as testtls:testtls which should
// not be used as auth credential and we depend on controller giving us
// 401 error which means controller is properly configured with auth
// it checks both cases when tls is enabled as well as tls disabled
// with auth enabled
command = fmt.Sprintf("echo $JAVA_OPTS | grep 'controller.auth.tlsEnabled=true' && curl -v -k -u testtls:testtls -s -X GET 'https://localhost:%d/v1/scopes/' 2>&1 -H 'accept: application/json' | grep 401 || (echo $JAVA_OPTS | grep 'controller.auth.tlsEnabled=false' && curl -v -k -u testtls:testtls -s -X GET 'http://localhost:%d/v1/scopes/' 2>&1 -H 'accept: application/json' | grep 401 ) || (echo $JAVA_OPTS | grep 'controller.security.tls.enable=true' && echo $JAVA_OPTS | grep -v 'controller.auth.tlsEnabled' && curl -v -k -u testtls:testtls -s -X GET 'https://localhost:%d/v1/scopes/' 2>&1 -H 'accept: application/json' | grep 401 ) || (curl -v -k -u testtls:testtls -s -X GET 'http://localhost:%d/v1/scopes/' 2>&1 -H 'accept: application/json' | grep 401 )", port, port, port, port)
SrishT marked this conversation as resolved.
Show resolved Hide resolved
} else {
// This is to check the readiness in case auth is not enabled
// and it covers both the cases with tls enabled and tls disabled
// along with auth disabled
command = fmt.Sprintf("echo $JAVA_OPTS | grep 'controller.auth.tlsEnabled=true' && curl -s -X GET 'https://localhost:%d/v1/scopes/' -H 'accept: application/json' | grep '_system'|| (echo $JAVA_OPTS | grep 'controller.auth.tlsEnabled=false' && curl -s -X GET 'http://localhost:%d/v1/scopes/' -H 'accept: application/json' | grep '_system' ) || (echo $JAVA_OPTS | grep 'controller.security.tls.enable=true' && echo $JAVA_OPTS | grep -v 'controller.auth.tlsEnabled' && curl -s -X GET 'https://localhost:%d/v1/scopes/' -H 'accept: application/json' | grep '_system' ) || (curl -s -X GET 'http://localhost:%d/v1/scopes/' -H 'accept: application/json' | grep '_system') ", port, port, port, port)
}
} else {
command = fmt.Sprintf("curl -s -X GET 'http://localhost:%d/v1/health/readiness' || curl -s -k -X GET 'https://localhost:%d/v1/health/readiness'", port, port)
Copy link
Contributor

@sarlaccpit sarlaccpit Aug 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there something in the readiness call payload that indicates whether or not pravega controller is running with auth on (I am not suggesting to send bad credentials to test out a 401 in return :) rather, just a field that literally says "auth: enabled" in the health readiness response). If so, then if auth was requested in the java options, we should find in the readiness a confirmation that auth is on. And the opposite if auth was not set or set to off in the java options.

If that's not something that is in the readiness payload today; I think it should be added, but there is nothing for this PR to leverage and line 109 is the best it can be at this time.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sarlaccpit the /liveness and /readiness endpoints are configured to return only a boolean response. There is another /getDetails endpoint which is exposed, which can be configured to return additional details like the one you mentioned, but this endpoint can only be accessed by authenticated users.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. I think it's fine the way it is then. It makes sense the auth/no auth (and probably many other details) are protected behind the get details endpoint.

}
return []string{"/bin/sh", "-c", command}
}

//This function check for the readiness of the controller in the following cases
//1) Auth and TLS Enabled- in this case, we check if the controller is properly enabled with authentication or not and we do a get on controller and with dummy credentials(testtls:testtls) and the controller returns 401 error in this case if it's correctly configured
//2) Auth Enabled and TLS Disabled- in this case, we check if the controller is properly enabled with authentication or not and we do a get on controller and with dummy credentials(testtls:testtls) and the controller returns 401 error in this case if it's correctly configured
//3) Auth Disabled and TLS Enabled- in this case, we check if the controller can create scopes or not by checking if _system scope is present or not
//4) Auth and TLS Disabled- in this case, we check if the controller can create scopes or not by checking if _system scope is present or not
func ControllerReadinessCheck(port int32, authflag bool) []string {
// This is to check the readiness of controller in case auth is Enabled
// here we are using login credential as testtls:testtls which should
// not be used as auth credential and we depend on controller giving us
// 401 error which means controller is properly configured with auth
// it checks both cases when tls is enabled as well as tls disabled
// with auth enabled
if authflag == true {
return []string{"/bin/sh", "-c", fmt.Sprintf("echo $JAVA_OPTS | grep 'controller.auth.tlsEnabled=true' && curl -v -k -u testtls:testtls -s -X GET 'https://localhost:%d/v1/scopes/' 2>&1 -H 'accept: application/json' | grep 401 || (echo $JAVA_OPTS | grep 'controller.auth.tlsEnabled=false' && curl -v -k -u testtls:testtls -s -X GET 'http://localhost:%d/v1/scopes/' 2>&1 -H 'accept: application/json' | grep 401 ) || (echo $JAVA_OPTS | grep 'controller.security.tls.enable=true' && echo $JAVA_OPTS | grep -v 'controller.auth.tlsEnabled' && curl -v -k -u testtls:testtls -s -X GET 'https://localhost:%d/v1/scopes/' 2>&1 -H 'accept: application/json' | grep 401 ) || (curl -v -k -u testtls:testtls -s -X GET 'http://localhost:%d/v1/scopes/' 2>&1 -H 'accept: application/json' | grep 401 )", port, port, port, port)}
}
// This is to check the readiness in case auth is not enabled
// and it covers both the cases with tls enabled and tls disabled
// along with auth disabled
return []string{"/bin/sh", "-c", fmt.Sprintf("echo $JAVA_OPTS | grep 'controller.auth.tlsEnabled=true' && curl -s -X GET 'https://localhost:%d/v1/scopes/' -H 'accept: application/json' | grep '_system'|| (echo $JAVA_OPTS | grep 'controller.auth.tlsEnabled=false' && curl -s -X GET 'http://localhost:%d/v1/scopes/' -H 'accept: application/json' | grep '_system' ) || (echo $JAVA_OPTS | grep 'controller.security.tls.enable=true' && echo $JAVA_OPTS | grep -v 'controller.auth.tlsEnabled' && curl -s -X GET 'https://localhost:%d/v1/scopes/' -H 'accept: application/json' | grep '_system' ) || (curl -s -X GET 'http://localhost:%d/v1/scopes/' -H 'accept: application/json' | grep '_system') ", port, port, port, port)}
func SegmentStoreReadinessCheck(version string, port int32, restport int32) []string {
command := ""
if IsVersionBelow(version, compareVersion) {
command = fmt.Sprintf("netstat -ltn 2> /dev/null | grep %d || ss -ltn 2> /dev/null | grep %d", port, port)
} else {
command = fmt.Sprintf("curl -s -X GET 'http://localhost:%d/v1/health/readiness' || curl -s -k -X GET 'https://localhost:%d/v1/health/readiness'", restport, restport)
}
return []string{"/bin/sh", "-c", command}
}

// Min returns the smaller of x or y.
Expand Down
Loading