Skip to content

Commit e393677

Browse files
committed
[FAB-2157] Retry connection to bootstrap peers
In gossip, if a peer starts up while its bootstrap peers are unavailable, it never retries the connection. This is especially risky in automatically managed environments such as the cloud, where large numbers of machines come up on their own and no one analyzes their logs in real time. This change makes the peer keep retrying the bootstrap peers, once every reconnect interval, until it knows of at least one other peer. I didn't add a new test; instead I rearranged one test in the discovery layer and one in the gossip layer so that the bootstrap peers start after the rest of the peers. Change-Id: Ib49179cfd4d17e1ed9c6b33ee522769ec4efc082 Signed-off-by: Yacov Manevich <[email protected]>
1 parent 776b629 commit e393677

File tree

3 files changed

+44
-24
lines changed

3 files changed

+44
-24
lines changed

gossip/discovery/discovery_impl.go

+28-15
Original file line number | Diff line number | Diff line change
@@ -112,7 +112,7 @@ func NewDiscoveryService(bootstrapPeers []string, self NetworkMember, comm CommS
112112
go d.periodicalReconnectToDead()
113113
go d.handlePresumedDeadPeers()
114114

115-
d.connect2BootstrapPeers(bootstrapPeers)
115+
go d.connect2BootstrapPeers(bootstrapPeers)
116116

117117
d.logger.Info("Started", self, "incTime is", d.incTime)
118118

@@ -156,22 +156,35 @@ func (d *gossipDiscoveryImpl) Connect(member NetworkMember) {
156156
func (d *gossipDiscoveryImpl) connect2BootstrapPeers(endpoints []string) {
157157
d.logger.Info("Entering:", endpoints)
158158
defer d.logger.Info("Exiting")
159-
wg := sync.WaitGroup{}
160-
req := d.createMembershipRequest()
161-
for _, endpoint := range endpoints {
162-
wg.Add(1)
163-
go func(endpoint string) {
164-
defer wg.Done()
165-
peer := &NetworkMember{
166-
Endpoint: endpoint,
167-
InternalEndpoint: &proto.SignedEndpoint{
159+
160+
for !d.somePeerIsKnown() {
161+
var wg sync.WaitGroup
162+
req := d.createMembershipRequest()
163+
wg.Add(len(endpoints))
164+
for _, endpoint := range endpoints {
165+
go func(endpoint string) {
166+
defer wg.Done()
167+
peer := &NetworkMember{
168168
Endpoint: endpoint,
169-
},
170-
}
171-
d.comm.SendToPeer(peer, req)
172-
}(endpoint)
169+
InternalEndpoint: &proto.SignedEndpoint{
170+
Endpoint: endpoint,
171+
},
172+
}
173+
if !d.comm.Ping(peer) {
174+
return
175+
}
176+
d.comm.SendToPeer(peer, req)
177+
}(endpoint)
178+
}
179+
wg.Wait()
180+
time.Sleep(reconnectInterval)
173181
}
174-
wg.Wait()
182+
}
183+
184+
func (d *gossipDiscoveryImpl) somePeerIsKnown() bool {
185+
d.lock.RLock()
186+
defer d.lock.RUnlock()
187+
return len(d.aliveLastTS) != 0
175188
}
176189

177190
func (d *gossipDiscoveryImpl) InitiateSync(peerNum int) {

gossip/discovery/discovery_test.go

+9-6
Original file line number | Diff line number | Diff line change
@@ -404,19 +404,22 @@ func TestGetFullMembership(t *testing.T) {
404404
nodeNum := 15
405405
bootPeers := []string{bootPeer(5511), bootPeer(5512)}
406406
instances := []*gossipInstance{}
407-
408-
inst := createDiscoveryInstance(5511, "d1", bootPeers)
409-
instances = append(instances, inst)
410-
411-
inst = createDiscoveryInstance(5512, "d2", bootPeers)
412-
instances = append(instances, inst)
407+
var inst *gossipInstance
413408

414409
for i := 3; i <= nodeNum; i++ {
415410
id := fmt.Sprintf("d%d", i)
416411
inst = createDiscoveryInstance(5510+i, id, bootPeers)
417412
instances = append(instances, inst)
418413
}
419414

415+
time.Sleep(time.Second)
416+
417+
inst = createDiscoveryInstance(5511, "d1", bootPeers)
418+
instances = append(instances, inst)
419+
420+
inst = createDiscoveryInstance(5512, "d2", bootPeers)
421+
instances = append(instances, inst)
422+
420423
assertMembership(t, instances, nodeNum-1)
421424

422425
// Ensure that internal endpoint was propagated to everyone

gossip/gossip/gossip_test.go

+7-3
Original file line number | Diff line number | Diff line change
@@ -230,9 +230,7 @@ func TestPull(t *testing.T) {
230230

231231
n := 5
232232
msgsCount2Send := 10
233-
boot := newGossipInstanceWithOnlyPull(portPrefix, 0, 100)
234-
boot.JoinChan(&joinChanMsg{}, common.ChainID("A"))
235-
boot.UpdateChannelMetadata([]byte("bla bla"), common.ChainID("A"))
233+
236234
peers := make([]Gossip, n)
237235
wg := sync.WaitGroup{}
238236
wg.Add(n)
@@ -247,6 +245,12 @@ func TestPull(t *testing.T) {
247245
}
248246
wg.Wait()
249247

248+
time.Sleep(time.Second)
249+
250+
boot := newGossipInstanceWithOnlyPull(portPrefix, 0, 100)
251+
boot.JoinChan(&joinChanMsg{}, common.ChainID("A"))
252+
boot.UpdateChannelMetadata([]byte("bla bla"), common.ChainID("A"))
253+
250254
knowAll := func() bool {
251255
for i := 1; i <= n; i++ {
252256
neighborCount := len(peers[i-1].Peers())

0 commit comments

Comments
 (0)