Skip to content

Commit e1467b8

Browse files
author
Marko Vukolic
committed
fix sbft hello msg issue
After receiveing a hello message, a replica would not cleanup its sbft state (checkpointDone and timer cancellation). This would leave a replica hanging not able to process new requests. Added a test (TestHelloMsg) to demonstrate this. This required also enabling sbft TestSystem configuration with disabled Timer interface. Change-Id: I4f8dc7e57c656a09ad536b229c9709213c5eb6af Signed-off-by: Marko Vukolic <[email protected]>
1 parent 3ef851e commit e1467b8

File tree

6 files changed

+91
-14
lines changed

6 files changed

+91
-14
lines changed

orderer/sbft/simplebft/checkpoint.go

+1-5
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ func (s *SBFT) handleCheckpoint(c *Checkpoint, src uint64) {
8484
cp := s.cur.checkpoint[r]
8585
cpset[r] = cp.Signature
8686
}
87-
s.cur.checkpointDone = true
8887

8988
c = s.cur.checkpoint[replicas[0]]
9089

@@ -98,10 +97,7 @@ func (s *SBFT) handleCheckpoint(c *Checkpoint, src uint64) {
9897
batch := *s.cur.preprep.Batch
9998
batch.Signatures = cpset
10099
s.deliverBatch(&batch)
101-
102-
s.cur.timeout.Cancel()
103-
log.Infof("replica %d: request %s %s completed on %d", s.id, s.cur.subject.Seq, hash2str(s.cur.subject.Digest), s.id)
104-
100+
log.Infof("replica %d: request %s %s delivered on %d (completed common case)", s.id, s.cur.subject.Seq, hash2str(s.cur.subject.Digest), s.id)
105101
s.maybeSendNextBatch()
106102
s.processBacklog()
107103
}

orderer/sbft/simplebft/connection.go

+3
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,15 @@ func (s *SBFT) Connection(replica uint64) {
6666

6767
func (s *SBFT) handleHello(h *Hello, src uint64) {
6868
bh, err := s.checkBatch(h.Batch, false, true)
69+
log.Debugf("replica %d: got hello for batch %d from replica %d", s.id, bh.Seq, src)
70+
6971
if err != nil {
7072
log.Warningf("replica %d: invalid hello batch from %d: %s", s.id, src, err)
7173
return
7274
}
7375

7476
if s.sys.LastBatch().DecodeHeader().Seq < bh.Seq {
77+
log.Debugf("replica %d: delivering batch %d after hello from replica %d", s.id, bh.Seq, src)
7578
s.deliverBatch(h.Batch)
7679
}
7780

orderer/sbft/simplebft/newview.go

-1
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,6 @@ func (s *SBFT) maybeDeliverUsingXset(nv *NewView) {
199199
// we just received a signature set for a request which we preprepared, but never delivered.
200200
prevBatch.Payloads = s.cur.preprep.Batch.Payloads
201201
}
202-
s.cur.checkpointDone = true
203202
s.deliverBatch(prevBatch)
204203
}
205204
}

orderer/sbft/simplebft/simplebft.go

+2
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,8 @@ func (s *SBFT) handleQueueableMessage(m *Msg, src uint64) {
230230
}
231231

232232
func (s *SBFT) deliverBatch(batch *Batch) {
233+
s.cur.checkpointDone = true
234+
s.cur.timeout.Cancel()
233235
s.sys.Deliver(batch)
234236

235237
for _, req := range batch.Payloads {

orderer/sbft/simplebft/simplebft_test.go

+66
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,72 @@ func TestFullBacklog(t *testing.T) {
940940
}
941941
}
942942

943+
func TestHelloMsg(t *testing.T) {
944+
N := uint64(4)
945+
sys := newTestSystemWOTimers(N)
946+
var repls []*SBFT
947+
var adapters []*testSystemAdapter
948+
for i := uint64(0); i < N; i++ {
949+
a := sys.NewAdapter(i)
950+
s, err := New(i, &Config{N: N, F: 1, BatchDurationNsec: 2000000000, BatchSizeBytes: 1, RequestTimeoutNsec: 20000000000}, a)
951+
if err != nil {
952+
t.Fatal(err)
953+
}
954+
repls = append(repls, s)
955+
adapters = append(adapters, a)
956+
}
957+
958+
phase := 1
959+
960+
// We are going to deliver only pre-prepare of the first request to replica 1
961+
// other messages pertaining to first request, destined to 1 will be dropped
962+
sys.filterFn = func(e testElem) (testElem, bool) {
963+
if msg, ok := e.ev.(*testMsgEvent); ok {
964+
if msg.dst == 1 {
965+
c := msg.msg.GetPreprepare()
966+
if c != nil && c.Seq.View == 0 && phase == 1 {
967+
return e, true // letting the first pre-prepare be delivered to 1
968+
}
969+
if phase > 1 {
970+
return e, true //letting msgs outside phase 1 through
971+
}
972+
return e, false //dropping other phase 1 msgs
973+
}
974+
}
975+
return e, true
976+
}
977+
978+
connectAll(sys)
979+
r1 := []byte{1, 2, 3}
980+
repls[0].Request(r1)
981+
sys.Run()
982+
983+
phase = 2 //start delivering msgs to replica 1
984+
985+
testLog.Notice("restarting replica 1")
986+
repls[1], _ = New(1, &Config{N: N, F: 1, BatchDurationNsec: 2000000000, BatchSizeBytes: 10, RequestTimeoutNsec: 20000000000}, adapters[1])
987+
for _, a := range adapters {
988+
if a.id != 1 {
989+
a.receiver.Connection(1)
990+
adapters[1].receiver.Connection(a.id)
991+
}
992+
}
993+
994+
sys.Run()
995+
996+
phase = 3
997+
998+
r3 := []byte{3, 5, 2}
999+
repls[1].Request(r3)
1000+
sys.Run()
1001+
1002+
for _, a := range adapters {
1003+
if len(a.batches) != 2 {
1004+
t.Fatal("expected execution of 2 batches")
1005+
}
1006+
}
1007+
}
1008+
9431009
func TestViewChangeTimer(t *testing.T) {
9441010
N := uint64(4)
9451011
sys := newTestSystem(N)

orderer/sbft/simplebft/testsys_test.go

+19-8
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,9 @@ func (t *testTimer) String() string {
139139

140140
func (t *testSystemAdapter) Timer(d time.Duration, tf func()) Canceller {
141141
tt := &testTimer{id: t.id, tf: tf}
142-
t.sys.enqueue(d, tt)
142+
if !t.sys.disableTimers {
143+
t.sys.enqueue(d, tt)
144+
}
143145
return tt
144146
}
145147

@@ -171,9 +173,8 @@ func (t *testSystemAdapter) Restore(key string, out proto.Message) bool {
171173
func (t *testSystemAdapter) LastBatch() *Batch {
172174
if len(t.batches) == 0 {
173175
return t.receiver.(*SBFT).makeBatch(0, nil, nil)
174-
} else {
175-
return t.batches[len(t.batches)-1]
176176
}
177+
return t.batches[len(t.batches)-1]
177178
}
178179

179180
func (t *testSystemAdapter) Sign(data []byte) []byte {
@@ -233,11 +234,12 @@ type testEvent interface {
233234
// ==============================================
234235

235236
type testSystem struct {
236-
rand *rand.Rand
237-
now time.Duration
238-
queue *calendarQueue
239-
adapters map[uint64]*testSystemAdapter
240-
filterFn func(testElem) (testElem, bool)
237+
rand *rand.Rand
238+
now time.Duration
239+
queue *calendarQueue
240+
adapters map[uint64]*testSystemAdapter
241+
filterFn func(testElem) (testElem, bool)
242+
disableTimers bool
241243
}
242244

243245
type testElem struct {
@@ -257,6 +259,15 @@ func newTestSystem(n uint64) *testSystem {
257259
}
258260
}
259261

262+
func newTestSystemWOTimers(n uint64) *testSystem {
263+
return &testSystem{
264+
rand: rand.New(rand.NewSource(0)),
265+
adapters: make(map[uint64]*testSystemAdapter),
266+
queue: newCalendarQueue(time.Millisecond/time.Duration(n*n), int(n*n)),
267+
disableTimers: true,
268+
}
269+
}
270+
260271
func (t *testSystem) NewAdapter(id uint64) *testSystemAdapter {
261272
key, err := ecdsa.GenerateKey(elliptic.P256(), crand.Reader)
262273
if err != nil {

0 commit comments

Comments
 (0)