-
Notifications
You must be signed in to change notification settings - Fork 880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add missing locks in agent and service code #1570
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ import ( | |
"net" | ||
"os" | ||
"sort" | ||
"sync" | ||
|
||
"github.com/Sirupsen/logrus" | ||
"github.com/docker/docker/pkg/stringid" | ||
|
@@ -39,6 +40,7 @@ type agent struct { | |
advertiseAddr string | ||
epTblCancel func() | ||
driverCancelFuncs map[string][]func() | ||
sync.Mutex | ||
} | ||
|
||
func getBindAddr(ifaceName string) (string, error) { | ||
|
@@ -86,9 +88,16 @@ func resolveAddr(addrOrInterface string) (string, error) { | |
func (c *controller) handleKeyChange(keys []*types.EncryptionKey) error { | ||
drvEnc := discoverapi.DriverEncryptionUpdate{} | ||
|
||
a := c.agent | ||
a := c.getAgent() | ||
if a == nil { | ||
logrus.Debug("Skipping key change as agent is nil") | ||
return nil | ||
} | ||
|
||
// Find the deleted key. If the deleted key was the primary key, | ||
// a new primary key should be set before removing if from keyring. | ||
c.Lock() | ||
added := []byte{} | ||
deleted := []byte{} | ||
j := len(c.keys) | ||
for i := 0; i < j; { | ||
|
@@ -127,7 +136,7 @@ func (c *controller) handleKeyChange(keys []*types.EncryptionKey) error { | |
if !same { | ||
c.keys = append(c.keys, key) | ||
if key.Subsystem == subsysGossip { | ||
a.networkDB.SetKey(key.Key) | ||
added = key.Key | ||
} | ||
|
||
if key.Subsystem == subsysIPSec { | ||
|
@@ -136,6 +145,11 @@ func (c *controller) handleKeyChange(keys []*types.EncryptionKey) error { | |
} | ||
} | ||
} | ||
c.Unlock() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Calling There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How can it deadlock ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Two concurrent go routines, one takes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Understand, but the nDB object does not have any reference to the agent or controller. It's a leaf object. From what I see, networkDB methods are properly coded so that they are the only one to exercise the nDB lock over the nDB data. I don't think the situation you are describing can happen. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I took care of it. PTAL There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, networkdb is currently a separate package without any dependency on libnetwork core. But its safer to avoid nested locks unless its really required. In this case there is no need to call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And yes. I agree, being conservative when it comes to holding locks is preferred. Especially when we have locks around a function call (with closely related code), it is much better to avoid it. |
||
|
||
if len(added) > 0 { | ||
a.networkDB.SetKey(added) | ||
} | ||
|
||
key, tag, err := c.getPrimaryKeyTag(subsysGossip) | ||
if err != nil { | ||
|
@@ -166,8 +180,10 @@ func (c *controller) handleKeyChange(keys []*types.EncryptionKey) error { | |
} | ||
|
||
func (c *controller) agentSetup() error { | ||
c.Lock() | ||
clusterProvider := c.cfg.Daemon.ClusterProvider | ||
|
||
agent := c.agent | ||
c.Unlock() | ||
bindAddr := clusterProvider.GetLocalAddress() | ||
advAddr := clusterProvider.GetAdvertiseAddress() | ||
remote := clusterProvider.GetRemoteAddress() | ||
|
@@ -176,7 +192,7 @@ func (c *controller) agentSetup() error { | |
listenAddr, _, _ := net.SplitHostPort(listen) | ||
|
||
logrus.Infof("Initializing Libnetwork Agent Listen-Addr=%s Local-addr=%s Adv-addr=%s Remote-addr =%s", listenAddr, bindAddr, advAddr, remoteAddr) | ||
if advAddr != "" && c.agent == nil { | ||
if advAddr != "" && agent == nil { | ||
if err := c.agentInit(listenAddr, bindAddr, advAddr); err != nil { | ||
logrus.Errorf("Error in agentInit : %v", err) | ||
} else { | ||
|
@@ -208,6 +224,9 @@ func (c *controller) agentSetup() error { | |
// For a given subsystem getKeys sorts the keys by lamport time and returns | ||
// slice of keys and lamport time which can used as a unique tag for the keys | ||
func (c *controller) getKeys(subsys string) ([][]byte, []uint64) { | ||
c.Lock() | ||
defer c.Unlock() | ||
|
||
sort.Sort(ByTime(c.keys)) | ||
|
||
keys := [][]byte{} | ||
|
@@ -227,6 +246,8 @@ func (c *controller) getKeys(subsys string) ([][]byte, []uint64) { | |
// getPrimaryKeyTag returns the primary key for a given subsystem from the | ||
// list of sorted key and the associated tag | ||
func (c *controller) getPrimaryKeyTag(subsys string) ([]byte, uint64, error) { | ||
c.Lock() | ||
defer c.Unlock() | ||
sort.Sort(ByTime(c.keys)) | ||
keys := []*types.EncryptionKey{} | ||
for _, key := range c.keys { | ||
|
@@ -265,13 +286,15 @@ func (c *controller) agentInit(listenAddr, bindAddrOrInterface, advertiseAddr st | |
|
||
ch, cancel := nDB.Watch("endpoint_table", "", "") | ||
|
||
c.Lock() | ||
c.agent = &agent{ | ||
networkDB: nDB, | ||
bindAddr: bindAddr, | ||
advertiseAddr: advertiseAddr, | ||
epTblCancel: cancel, | ||
driverCancelFuncs: make(map[string][]func()), | ||
} | ||
c.Unlock() | ||
|
||
go c.handleTableEvents(ch, c.handleEpTableEvent) | ||
|
||
|
@@ -294,21 +317,22 @@ func (c *controller) agentInit(listenAddr, bindAddrOrInterface, advertiseAddr st | |
} | ||
|
||
func (c *controller) agentJoin(remote string) error { | ||
if c.agent == nil { | ||
agent := c.getAgent() | ||
if agent == nil { | ||
return nil | ||
} | ||
|
||
return c.agent.networkDB.Join([]string{remote}) | ||
return agent.networkDB.Join([]string{remote}) | ||
} | ||
|
||
func (c *controller) agentDriverNotify(d driverapi.Driver) { | ||
if c.agent == nil { | ||
agent := c.getAgent() | ||
if agent == nil { | ||
return | ||
} | ||
|
||
d.DiscoverNew(discoverapi.NodeDiscovery, discoverapi.NodeDiscoveryData{ | ||
Address: c.agent.advertiseAddr, | ||
BindAddress: c.agent.bindAddr, | ||
Address: agent.advertiseAddr, | ||
BindAddress: agent.bindAddr, | ||
Self: true, | ||
}) | ||
|
||
|
@@ -339,11 +363,19 @@ func (c *controller) agentClose() { | |
return | ||
} | ||
|
||
var cancelList []func() | ||
|
||
agent.Lock() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we get the slice of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will take care of it. Thanks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I took care of it. PTAL |
||
for _, cancelFuncs := range agent.driverCancelFuncs { | ||
for _, cancel := range cancelFuncs { | ||
cancel() | ||
cancelList = append(cancelList, cancel) | ||
} | ||
} | ||
agent.Unlock() | ||
|
||
for _, cancel := range cancelList { | ||
cancel() | ||
} | ||
|
||
agent.epTblCancel() | ||
|
||
|
@@ -354,31 +386,33 @@ func (n *network) isClusterEligible() bool { | |
if n.driverScope() != datastore.GlobalScope { | ||
return false | ||
} | ||
|
||
c := n.getController() | ||
if c.agent == nil { | ||
return false | ||
} | ||
|
||
return true | ||
return n.getController().getAgent() != nil | ||
} | ||
|
||
func (n *network) joinCluster() error { | ||
if !n.isClusterEligible() { | ||
return nil | ||
} | ||
|
||
c := n.getController() | ||
return c.agent.networkDB.JoinNetwork(n.ID()) | ||
agent := n.getController().getAgent() | ||
if agent == nil { | ||
return nil | ||
} | ||
|
||
return agent.networkDB.JoinNetwork(n.ID()) | ||
} | ||
|
||
func (n *network) leaveCluster() error { | ||
if !n.isClusterEligible() { | ||
return nil | ||
} | ||
|
||
c := n.getController() | ||
return c.agent.networkDB.LeaveNetwork(n.ID()) | ||
agent := n.getController().getAgent() | ||
if agent == nil { | ||
return nil | ||
} | ||
|
||
return agent.networkDB.LeaveNetwork(n.ID()) | ||
} | ||
|
||
func (ep *endpoint) addDriverInfoToCluster() error { | ||
|
@@ -390,10 +424,7 @@ func (ep *endpoint) addDriverInfoToCluster() error { | |
return nil | ||
} | ||
|
||
ctrlr := n.ctrlr | ||
ctrlr.Lock() | ||
agent := ctrlr.agent | ||
ctrlr.Unlock() | ||
agent := n.getController().getAgent() | ||
if agent == nil { | ||
return nil | ||
} | ||
|
@@ -415,10 +446,7 @@ func (ep *endpoint) deleteDriverInfoFromCluster() error { | |
return nil | ||
} | ||
|
||
ctrlr := n.ctrlr | ||
ctrlr.Lock() | ||
agent := ctrlr.agent | ||
ctrlr.Unlock() | ||
agent := n.getController().getAgent() | ||
if agent == nil { | ||
return nil | ||
} | ||
|
@@ -438,6 +466,7 @@ func (ep *endpoint) addServiceInfoToCluster() error { | |
} | ||
|
||
c := n.getController() | ||
agent := c.getAgent() | ||
if !ep.isAnonymous() && ep.Iface().Address() != nil { | ||
var ingressPorts []*PortConfig | ||
if ep.svcID != "" { | ||
|
@@ -466,8 +495,10 @@ func (ep *endpoint) addServiceInfoToCluster() error { | |
return err | ||
} | ||
|
||
if err := c.agent.networkDB.CreateEntry("endpoint_table", n.ID(), ep.ID(), buf); err != nil { | ||
return err | ||
if agent != nil { | ||
if err := agent.networkDB.CreateEntry("endpoint_table", n.ID(), ep.ID(), buf); err != nil { | ||
return err | ||
} | ||
} | ||
} | ||
|
||
|
@@ -481,6 +512,8 @@ func (ep *endpoint) deleteServiceInfoFromCluster() error { | |
} | ||
|
||
c := n.getController() | ||
agent := c.getAgent() | ||
|
||
if !ep.isAnonymous() { | ||
if ep.svcID != "" && ep.Iface().Address() != nil { | ||
var ingressPorts []*PortConfig | ||
|
@@ -492,9 +525,10 @@ func (ep *endpoint) deleteServiceInfoFromCluster() error { | |
return err | ||
} | ||
} | ||
|
||
if err := c.agent.networkDB.DeleteEntry("endpoint_table", n.ID(), ep.ID()); err != nil { | ||
return err | ||
if agent != nil { | ||
if err := agent.networkDB.DeleteEntry("endpoint_table", n.ID(), ep.ID()); err != nil { | ||
return err | ||
} | ||
} | ||
} | ||
return nil | ||
|
@@ -506,24 +540,23 @@ func (n *network) addDriverWatches() { | |
} | ||
|
||
c := n.getController() | ||
agent := c.getAgent() | ||
if agent == nil { | ||
return | ||
} | ||
for _, tableName := range n.driverTables { | ||
c.Lock() | ||
if c.agent == nil { | ||
c.Unlock() | ||
return | ||
} | ||
ch, cancel := c.agent.networkDB.Watch(tableName, n.ID(), "") | ||
c.agent.driverCancelFuncs[n.ID()] = append(c.agent.driverCancelFuncs[n.ID()], cancel) | ||
c.Unlock() | ||
|
||
ch, cancel := agent.networkDB.Watch(tableName, n.ID(), "") | ||
agent.Lock() | ||
agent.driverCancelFuncs[n.ID()] = append(agent.driverCancelFuncs[n.ID()], cancel) | ||
agent.Unlock() | ||
go c.handleTableEvents(ch, n.handleDriverTableEvent) | ||
d, err := n.driver(false) | ||
if err != nil { | ||
logrus.Errorf("Could not resolve driver %s while walking driver tabl: %v", n.networkType, err) | ||
return | ||
} | ||
|
||
c.agent.networkDB.WalkTable(tableName, func(nid, key string, value []byte) bool { | ||
agent.networkDB.WalkTable(tableName, func(nid, key string, value []byte) bool { | ||
if nid == n.ID() { | ||
d.EventNotify(driverapi.Create, nid, tableName, key, value) | ||
} | ||
|
@@ -538,11 +571,15 @@ func (n *network) cancelDriverWatches() { | |
return | ||
} | ||
|
||
c := n.getController() | ||
c.Lock() | ||
cancelFuncs := c.agent.driverCancelFuncs[n.ID()] | ||
delete(c.agent.driverCancelFuncs, n.ID()) | ||
c.Unlock() | ||
agent := n.getController().getAgent() | ||
if agent == nil { | ||
return | ||
} | ||
|
||
agent.Lock() | ||
cancelFuncs := agent.driverCancelFuncs[n.ID()] | ||
delete(agent.driverCancelFuncs, n.ID()) | ||
agent.Unlock() | ||
|
||
for _, cancel := range cancelFuncs { | ||
cancel() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1485,17 +1485,12 @@ func (n *network) Peers() []networkdb.PeerInfo { | |
return []networkdb.PeerInfo{} | ||
} | ||
|
||
var nDB *networkdb.NetworkDB | ||
n.ctrlr.Lock() | ||
if n.ctrlr.agentInitDone == nil && n.ctrlr.agent != nil { | ||
nDB = n.ctrlr.agent.networkDB | ||
agent := n.getController().getAgent() | ||
if agent == nil { | ||
return []networkdb.PeerInfo{} | ||
} | ||
n.ctrlr.Unlock() | ||
|
||
if nDB != nil { | ||
return n.ctrlr.agent.networkDB.Peers(n.id) | ||
} | ||
return []networkdb.PeerInfo{} | ||
return agent.networkDB.Peers(n.ID()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it safe to assume that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes it is safe, because the |
||
} | ||
|
||
func (n *network) DriverOptions() map[string]string { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
am not too confident on this code-path. So consider this as a question... It looks like SetKey can be potentially called multiple times based on how many unique keys of type "subsysGossip" is present ?
With this change, it seems to be assumed that it will be only once and it is cached in
added
variable which is later used to set Key again. Is that the expectation ?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, it follows the existing logic for the deleted key (look for the
deleted
variable).This is the code path for the key rotation, where we know there will only be one deleted and one added key. But @sanimej can correct us here, he initially wrote this logic.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mavenugo What aboch said is correct.The key rotation logic adds one new key to the set and removes one. So before this change also only one key would change on a rotation.