Skip to content

Commit

Permalink
Detect connection loss through IRC PONG
Browse files Browse the repository at this point in the history
In the current state, the alertmanager-irc-relay already sends minutely
IRC PINGs. This allows to check the IRC connection's health in protocol
without having to deal with specific TCP settings. However, even when
we are sending those PINGs, we don't process the server's PONGs or their
absence.

On one of my alertmanager-irc-relay instances, the time between the last
received PONG and the TCP read to fail was round about fifteen minutes.
All this time, the connection was already dead, but there was no attempt
to reestablish it.

The introduces changes keep book on the last received PONG and fails if
there was no new PONG within twice the pingFrequencySecs time. When
establishing a new connection during the SetupPhase, the current time
will be set as the last PONG's time to reset the time comparison.
  • Loading branch information
oxzi committed Jun 7, 2023
1 parent d47139a commit 5aed959
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 2 deletions.
2 changes: 2 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type Config struct {
IRCHostPass string `yaml:"irc_host_password"`
IRCUseSSL bool `yaml:"irc_use_ssl"`
IRCVerifySSL bool `yaml:"irc_verify_ssl"`
IRCPingSecs int `yaml:"irc_ping_secs"`
IRCChannels []IRCChannel `yaml:"irc_channels"`
MsgTemplate string `yaml:"msg_template"`
MsgOnce bool `yaml:"msg_once_per_alert_group"`
Expand All @@ -66,6 +67,7 @@ func LoadConfig(configFile string) (*Config, error) {
IRCHostPass: "",
IRCUseSSL: true,
IRCVerifySSL: true,
IRCPingSecs: 60,
IRCChannels: []IRCChannel{},
MsgOnce: false,
UsePrivmsg: false,
Expand Down
34 changes: 32 additions & 2 deletions irc.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
)

const (
pingFrequencySecs = 60
connectionTimeoutSecs = 30
nickservWaitSecs = 10
ircConnectMaxBackoffSecs = 300
Expand Down Expand Up @@ -69,7 +68,7 @@ func makeGOIRCConfig(config *Config) *irc.Config {
ServerName: config.IRCHost,
InsecureSkipVerify: !config.IRCVerifySSL,
}
ircConfig.PingFreq = pingFrequencySecs * time.Second
ircConfig.PingFreq = time.Duration(config.IRCPingSecs) * time.Second
ircConfig.Timeout = connectionTimeoutSecs * time.Second
ircConfig.NewNick = func(n string) string { return n + "^" }

Expand Down Expand Up @@ -102,6 +101,9 @@ type IRCNotifier struct {
sessionUp bool
sessionUpSignal chan bool
sessionDownSignal chan bool
sessionPongSignal chan bool
sessionPingOnce sync.Once
sessionLastPong time.Time
sessionWg sync.WaitGroup

channelReconciler *ChannelReconciler
Expand Down Expand Up @@ -136,6 +138,7 @@ func NewIRCNotifier(config *Config, alertMsgs chan AlertMsg, delayerMaker Delaye
AlertMsgs: alertMsgs,
sessionUpSignal: make(chan bool),
sessionDownSignal: make(chan bool),
sessionPongSignal: make(chan bool),
channelReconciler: channelReconciler,
UsePrivmsg: config.UsePrivmsg,
NickservDelayWait: nickservWaitSecs * time.Second,
Expand Down Expand Up @@ -166,6 +169,11 @@ func (n *IRCNotifier) registerHandlers() {
n.HandleNotice(line.Nick, line.Text())
})

n.Client.HandleFunc(irc.PONG,
func(_ *irc.Conn, line *irc.Line) {
n.sessionPongSignal <- true
})

for _, event := range []string{"433"} {
n.Client.HandleFunc(event, loggerHandler)
}
Expand Down Expand Up @@ -282,6 +290,7 @@ func (n *IRCNotifier) ShutdownPhase() {
logging.Info("Wait for IRC disconnect to complete")
select {
case <-n.sessionDownSignal:
case <-n.sessionPongSignal:
case <-n.timeTeller.After(n.Client.Config().Timeout):
logging.Warn("Timeout while waiting for IRC disconnect to complete, stopping anyway")
}
Expand All @@ -294,6 +303,23 @@ func (n *IRCNotifier) ConnectedPhase(ctx context.Context) {
select {
case alertMsg := <-n.AlertMsgs:
n.SendAlertMsg(ctx, &alertMsg)
case <-n.sessionPongSignal:
logging.Debug("Received a PONG message; prev PONG was at %v", n.sessionLastPong)
n.sessionLastPong = time.Now()
case <-time.After(2*n.IrcConfig.PingFreq - time.Since(n.sessionLastPong)):
// Calling n.Client.Close() will trigger n.sessionDownSignal. However, as
// this also dispatches a hook, which we will catch as sessionDownSignal,
// this needs to be done in a concurrent fashion if we don't want to
// deadlock ourself.
//
// Furthermore, as this time.After(...) interval is now zero, it will also
// trigger when visiting this select the next time. To mitigate multiple
// Close() calls, it is wrapped within an sync.Once which will be reset
// during SetupPhase's sessionUpSignal.
n.sessionPingOnce.Do(func() {
logging.Error("Haven't received a PONG after twice the PING period")
go n.Client.Close()
})
case <-n.sessionDownSignal:
n.sessionUp = false
n.sessionWg.Done()
Expand Down Expand Up @@ -325,13 +351,17 @@ func (n *IRCNotifier) SetupPhase(ctx context.Context) {
select {
case <-n.sessionUpSignal:
n.sessionUp = true
n.sessionPingOnce = sync.Once{}
n.sessionLastPong = time.Now()
n.sessionWg.Add(1)
n.MaybeGhostNick()
n.MaybeWaitForNickserv()
n.channelReconciler.Start(ctx)
ircConnectedGauge.Set(1)
case <-n.sessionDownSignal:
logging.Warn("Receiving a session down before the session is up, this is odd")
case <-n.sessionPongSignal:
logging.Warn("Receiving a PONG before the session is up, this is odd")
case <-ctx.Done():
logging.Info("IRC routine asked to terminate")
}
Expand Down
112 changes: 112 additions & 0 deletions irc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ func makeTestIRCConfig(IRCPort int) *Config {
IRCHost: "127.0.0.1",
IRCPort: IRCPort,
IRCUseSSL: false,
IRCPingSecs: 60,
IRCChannels: []IRCChannel{
IRCChannel{Name: "#foo"},
},
Expand Down Expand Up @@ -466,6 +467,66 @@ func TestReconnectNickIdentChange(t *testing.T) {
}
}

func TestReconnectMissingPong(t *testing.T) {
server, port := makeTestServer(t)
config := makeTestIRCConfig(port)
config.IRCPingSecs = 2
notifier, _, ctx, cancel, stopWg := makeTestNotifier(t, config)

var testStep sync.WaitGroup

joinHandler := func(conn *bufio.ReadWriter, line *irc.Line) error {
testStep.Done()
return hJOIN(conn, line)
}
server.SetHandler("JOIN", joinHandler)

testStep.Add(1)
go notifier.Run(ctx, stopWg)

// Wait until the pre-joined channel is seen.
testStep.Wait()

// Wait for a client disconnect due to missing PONGs...
testStep.Add(1)

// Wait again until the pre-joined channel is seen.
testStep.Wait()

cancel()
stopWg.Wait()

server.Stop()

expectedCommands := []string{
// Commands from first connection
"NICK foo",
"USER foo 12 * :",
"PRIVMSG ChanServ :UNBAN #foo",
"JOIN #foo",
// Ping commands contain timestamps; note the modified check below!
// Commands from reconnection
"NICK foo",
"USER foo 12 * :",
"PRIVMSG ChanServ :UNBAN #foo",
"JOIN #foo",
"QUIT :see ya",
}

logPos := 0
for _, cmd := range server.Log {
if !strings.HasPrefix(cmd, "PING :") {
server.Log[logPos] = cmd
logPos++
}
}
server.Log = server.Log[:logPos]

if !reflect.DeepEqual(expectedCommands, server.Log) {
t.Error("Reconnection did not happen correctly. Received commands:\n", strings.Join(server.Log, "\n"))
}
}

func TestConnectErrorRetry(t *testing.T) {
server, port := makeTestServer(t)
config := makeTestIRCConfig(port)
Expand Down Expand Up @@ -674,3 +735,54 @@ func TestStopRunningWhenHalfConnected(t *testing.T) {
t.Error("Alert not sent correctly. Received commands:\n", strings.Join(server.Log, "\n"))
}
}

func TestPingPong(t *testing.T) {
server, port := makeTestServer(t)
config := makeTestIRCConfig(port)
config.IRCPingSecs = 1
notifier, _, ctx, cancel, stopWg := makeTestNotifier(t, config)

var testStep sync.WaitGroup

pingHandler := func(conn *bufio.ReadWriter, line *irc.Line) error {
testStep.Done()
r := fmt.Sprintf(":example.com PONG example.com :%s", line.Args[0])
_, err := conn.WriteString(r)
return err
}
server.SetHandler("PING", pingHandler)

testStep.Add(3)
go notifier.Run(ctx, stopWg)

// Wait until three PING-PONGs have been exchanged..
testStep.Wait()

cancel()
stopWg.Wait()

server.Stop()

expectedCommands := []string{
// Commands from first connection
"NICK foo",
"USER foo 12 * :",
"PRIVMSG ChanServ :UNBAN #foo",
"JOIN #foo",
// Ping commands contain timestamps; note the modified check below!
"PING :__TS1__",
"PING :__TS2__",
"PING :__TS3__",
"QUIT :see ya",
}

expectedCommandsCheck := len(expectedCommands) == len(server.Log) &&
reflect.DeepEqual(expectedCommands[:4], server.Log[:4]) &&
strings.HasPrefix(server.Log[4], "PING :") &&
strings.HasPrefix(server.Log[5], "PING :") &&
strings.HasPrefix(server.Log[6], "PING :") &&
reflect.DeepEqual(expectedCommands[7:], server.Log[7:])
if !expectedCommandsCheck {
t.Error("Reconnection did not happen correctly. Received commands:\n", strings.Join(server.Log, "\n"))
}
}

0 comments on commit 5aed959

Please sign in to comment.