Skip to content

Commit bb6bc8d

Browse files
committed
sbft: fix restart bug and test
commit 24514a027377631cc4b3c53a56d7fbcad75b7e5b Author: Simon Schubert <[email protected]> Date: Wed Oct 26 18:59:33 2016 +0200 sbft: resend in-flight messages on reconnect This allows a recently disconnected replica to catch up and cancel its request timer. Change-Id: I6e32176777fcf74e48bd2a6402793cf4712637d5 Signed-off-by: Simon Schubert <[email protected]> commit 08232f825ae54569c8fd296717a8ad242d9ad84d Author: Simon Schubert <[email protected]> Date: Wed Oct 26 18:59:19 2016 +0200 sbft: ignore duplicate preprepare messages Change-Id: Ibfb47ecf1e005bdd23ed40f3753b01f49dfffd94 Signed-off-by: Simon Schubert <[email protected]> commit 8ba9a9548727c36b8e276003d9a0eb855b2a6afb Author: Simon Schubert <[email protected]> Date: Wed Oct 26 18:56:40 2016 +0200 sbft: do not use non-deterministic map iteration in tests This change hides a bug that occasionally was exposed due to a specific map iteration order. We add TestErroneousViewChange to explicitly test for this bug, which is fixed in a followup commit. Change-Id: I192903dd57d637ec000a628f9f61b44749caf602 Signed-off-by: Simon Schubert <[email protected]> Change-Id: I32a3c424dd385704b77ed884b2ec0f6d04f6ac6c Signed-off-by: Simon Schubert <[email protected]>
1 parent 4274764 commit bb6bc8d

File tree

5 files changed

+149
-13
lines changed

5 files changed

+149
-13
lines changed

consensus/simplebft/checkpoint.go

+6-2
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,18 @@ import (
2121
"reflect"
2222
)
2323

24-
func (s *SBFT) sendCheckpoint() {
24+
func (s *SBFT) makeCheckpoint() *Checkpoint {
2525
sig := s.sys.Sign(s.cur.subject.Digest)
2626
c := &Checkpoint{
2727
Seq: s.cur.subject.Seq.Seq,
2828
Digest: s.cur.subject.Digest,
2929
Signature: sig,
3030
}
31-
s.broadcast(&Msg{&Msg_Checkpoint{c}})
31+
return c
32+
}
33+
34+
func (s *SBFT) sendCheckpoint() {
35+
s.broadcast(&Msg{&Msg_Checkpoint{s.makeCheckpoint()}})
3236
}
3337

3438
func (s *SBFT) handleCheckpoint(c *Checkpoint, src uint64) {

consensus/simplebft/connection.go

+28-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ limitations under the License.
1616

1717
package simplebft
1818

19+
import "github.com/golang/protobuf/proto"
20+
1921
// Connection is an event from system to notify a new connection with
2022
// replica.
2123
// On connection, we send our latest (weak) checkpoint, and we expect
@@ -25,8 +27,6 @@ func (s *SBFT) Connection(replica uint64) {
2527
batch.Payloads = nil // don't send the big payload
2628
s.sys.Send(&Msg{&Msg_Hello{&batch}}, replica)
2729

28-
// TODO
29-
//
3030
// A reconnecting replica can play forward its blockchain to
3131
// the batch listed in the hello message. However, the
3232
// currently in-flight batch will not be reflected in the
@@ -36,6 +36,32 @@ func (s *SBFT) Connection(replica uint64) {
3636
// Therefore we also send the most recent (pre)prepare,
3737
// commit, checkpoint so that the reconnecting replica can
3838
// catch up on the in-flight batch.
39+
//
40+
// TODO We need to communicate the latest view to the
41+
// connecting replica. The new view message is not signed, so
42+
// we cannot send that message. The worst corner case is
43+
// connecting right after a new-view message was received, and
44+
// its xset batch is in-flight.
45+
46+
batchheader := &BatchHeader{}
47+
err := proto.Unmarshal(batch.Header, batchheader)
48+
if err != nil {
49+
panic(err)
50+
}
51+
52+
if s.cur.subject.Seq.Seq > batchheader.Seq {
53+
if s.isPrimary() {
54+
s.sys.Send(&Msg{&Msg_Preprepare{s.cur.preprep}}, replica)
55+
} else {
56+
s.sys.Send(&Msg{&Msg_Prepare{&s.cur.subject}}, replica)
57+
}
58+
if s.cur.sentCommit {
59+
s.sys.Send(&Msg{&Msg_Commit{&s.cur.subject}}, replica)
60+
}
61+
if s.cur.executed {
62+
s.sys.Send(&Msg{&Msg_Checkpoint{s.makeCheckpoint()}}, replica)
63+
}
64+
}
3965
}
4066

4167
func (s *SBFT) handleHello(h *Batch, src uint64) {

consensus/simplebft/preprepare.go

+4
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ func (s *SBFT) handlePreprepare(pp *Preprepare, src uint64) {
5050
log.Infof("preprepare does not match expected %v, got %v", nextSeq, *pp.Seq)
5151
return
5252
}
53+
if s.cur.subject.Seq.Seq == pp.Seq.Seq {
54+
log.Infof("duplicate preprepare for %v", *pp.Seq)
55+
return
56+
}
5357
var blockhash []byte
5458
if pp.Batch != nil {
5559
blockhash = hash(pp.Batch.Header)

consensus/simplebft/simplebft.go

+1-2
Original file line numberDiff line numberDiff line change
@@ -129,17 +129,16 @@ func New(id uint64, config *Config, sys System) (*SBFT, error) {
129129
s.seq = *pp.Seq
130130
s.seq.Seq -= 1
131131
s.handlePreprepare(pp, s.primaryIDView(pp.Seq.View))
132+
// ideally we wouldn't send a prepare here
132133
}
133134
}
134135
c := &Subject{}
135136
if s.sys.Restore("commit", c) && reflect.DeepEqual(c, &s.cur.subject) {
136137
s.cur.sentCommit = true
137-
s.sendCommit()
138138
}
139139
ex := &Subject{}
140140
if s.sys.Restore("execute", ex) && reflect.DeepEqual(c, &s.cur.subject) {
141141
s.cur.executed = true
142-
s.sendCheckpoint()
143142
}
144143

145144
// XXX set active after checking with the network

consensus/simplebft/simplebft_test.go

+110-7
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,25 @@ func init() {
3434
}
3535

3636
func connectAll(sys *testSystem) {
37+
// map iteration is non-deterministic, so use linear iteration instead
38+
max := uint64(0)
3739
for _, a := range sys.adapters {
38-
for _, b := range sys.adapters {
40+
if a.id > max {
41+
max = a.id
42+
}
43+
}
44+
45+
for i := uint64(0); i <= max; i++ {
46+
a, ok := sys.adapters[i]
47+
if !ok {
48+
continue
49+
}
50+
51+
for j := uint64(0); j <= max; j++ {
52+
b, ok := sys.adapters[j]
53+
if !ok {
54+
continue
55+
}
3956
if a.id != b.id {
4057
a.receiver.Connection(b.id)
4158
}
@@ -342,7 +359,7 @@ func TestRestart(t *testing.T) {
342359

343360
testLog.Notice("restarting 0")
344361
repls[0], _ = New(0, &Config{N: N, F: 1, BatchDurationNsec: 2000000000, BatchSizeBytes: 10, RequestTimeoutNsec: 20000000000}, adapters[0])
345-
for _, a := range sys.adapters {
362+
for _, a := range adapters {
346363
if a.id != 0 {
347364
a.receiver.Connection(0)
348365
adapters[0].receiver.Connection(a.id)
@@ -394,7 +411,7 @@ func TestRestartAfterPrepare(t *testing.T) {
394411
if p := msg.msg.GetPrepare(); p != nil && p.Seq.Seq == 3 && !restarted {
395412
restarted = true
396413
repls[0], _ = New(0, &Config{N: N, F: 1, BatchDurationNsec: 2000000000, BatchSizeBytes: 10, RequestTimeoutNsec: 20000000000}, adapters[0])
397-
for _, a := range sys.adapters {
414+
for _, a := range adapters {
398415
if a.id != 0 {
399416
a.receiver.Connection(0)
400417
adapters[0].receiver.Connection(a.id)
@@ -462,7 +479,7 @@ func TestRestartAfterCommit(t *testing.T) {
462479
restarted = true
463480
testLog.Notice("restarting 0")
464481
repls[0], _ = New(0, &Config{N: N, F: 1, BatchDurationNsec: 2000000000, BatchSizeBytes: 10, RequestTimeoutNsec: 20000000000}, adapters[0])
465-
for _, a := range sys.adapters {
482+
for _, a := range adapters {
466483
if a.id != 0 {
467484
a.receiver.Connection(0)
468485
adapters[0].receiver.Connection(a.id)
@@ -503,8 +520,6 @@ func TestRestartAfterCommit(t *testing.T) {
503520
}
504521

505522
func TestRestartAfterCheckpoint(t *testing.T) {
506-
// TODO re-enable this test after https://jira.hyperledger.org/browse/FAB-624 has been resolved
507-
t.Skip()
508523
N := uint64(4)
509524
sys := newTestSystem(N)
510525
var repls []*SBFT
@@ -532,7 +547,7 @@ func TestRestartAfterCheckpoint(t *testing.T) {
532547
restarted = true
533548
testLog.Notice("restarting 0")
534549
repls[0], _ = New(0, &Config{N: N, F: 1, BatchDurationNsec: 2000000000, BatchSizeBytes: 10, RequestTimeoutNsec: 20000000000}, adapters[0])
535-
for _, a := range sys.adapters {
550+
for _, a := range adapters {
536551
if a.id != 0 {
537552
a.receiver.Connection(0)
538553
adapters[0].receiver.Connection(a.id)
@@ -571,3 +586,91 @@ func TestRestartAfterCheckpoint(t *testing.T) {
571586
}
572587
}
573588
}
589+
590+
func TestErroneousViewChange(t *testing.T) {
591+
N := uint64(4)
592+
sys := newTestSystem(N)
593+
var repls []*SBFT
594+
var adapters []*testSystemAdapter
595+
for i := uint64(0); i < N; i++ {
596+
a := sys.NewAdapter(i)
597+
s, err := New(i, &Config{N: N, F: 1, BatchDurationNsec: 2000000000, BatchSizeBytes: 10, RequestTimeoutNsec: 20000000000}, a)
598+
if err != nil {
599+
t.Fatal(err)
600+
}
601+
repls = append(repls, s)
602+
adapters = append(adapters, a)
603+
}
604+
605+
restarted := false
606+
607+
// network outage after prepares are received
608+
sys.filterFn = func(e testElem) (testElem, bool) {
609+
if msg, ok := e.ev.(*testMsgEvent); ok {
610+
if msg.src == msg.dst || msg.src != 0 {
611+
return e, true
612+
}
613+
614+
if c := msg.msg.GetCheckpoint(); c != nil && c.Seq == 3 && !restarted {
615+
restarted = true
616+
testLog.Notice("restarting 0")
617+
repls[0], _ = New(0, &Config{N: N, F: 1, BatchDurationNsec: 2000000000, BatchSizeBytes: 10, RequestTimeoutNsec: 20000000000}, adapters[0])
618+
for _, a := range adapters {
619+
if a.id != 0 {
620+
a.receiver.Connection(0)
621+
adapters[0].receiver.Connection(a.id)
622+
}
623+
}
624+
}
625+
}
626+
627+
return e, true
628+
}
629+
630+
// iteration order here is essential to trigger the bug
631+
outer := []uint64{2, 3, 0, 1}
632+
inner := []uint64{0, 1, 2, 3}
633+
for _, i := range outer {
634+
a, ok := sys.adapters[i]
635+
if !ok {
636+
continue
637+
}
638+
639+
for _, j := range inner {
640+
b, ok := sys.adapters[j]
641+
if !ok {
642+
continue
643+
}
644+
if a.id != b.id {
645+
a.receiver.Connection(b.id)
646+
}
647+
}
648+
}
649+
sys.Run()
650+
651+
// move to view 1
652+
for _, r := range repls {
653+
r.sendViewChange()
654+
}
655+
656+
r1 := []byte{1, 2, 3}
657+
repls[0].Request(r1)
658+
sys.Run()
659+
660+
r2 := []byte{3, 1, 2}
661+
r3 := []byte{3, 5, 2}
662+
repls[1].Request(r2)
663+
repls[1].Request(r3)
664+
sys.Run()
665+
for _, a := range adapters {
666+
if len(a.batches) != 3 {
667+
t.Fatal("expected execution of 3 batches")
668+
}
669+
if !reflect.DeepEqual([][]byte{r1}, a.batches[1].Payloads) {
670+
t.Error("wrong request executed (1)")
671+
}
672+
if !reflect.DeepEqual([][]byte{r2, r3}, a.batches[2].Payloads) {
673+
t.Error("wrong request executed (2)")
674+
}
675+
}
676+
}

0 commit comments

Comments
 (0)