Skip to content

Commit 1d8114f

Browse files
author
jyellick
committed
Fix setting of watermark on restore from crash
Change-Id: I9c66cf9ae2c5fa3e0c09cc372453ed15833417aa Signed-off-by: jyellick <[email protected]>
1 parent fdaaff1 commit 1d8114f

File tree

2 files changed

+75
-8
lines changed

2 files changed

+75
-8
lines changed

consensus/pbft/pbft-core_test.go

+67
Original file line numberDiff line numberDiff line change
@@ -1179,6 +1179,73 @@ func TestReplicaCrash3(t *testing.T) {
11791179
}
11801180
}
11811181

1182+
// TestReplicaCrash4 simulates the restart with no checkpoints
1183+
// in the store because they have been garbage collected
1184+
// the bug occurs because the low watermark is incorrectly set to
1185+
// be zero
1186+
func TestReplicaCrash4(t *testing.T) {
1187+
validatorCount := 4
1188+
config := loadConfig()
1189+
config.Set("general.K", 2)
1190+
config.Set("general.logmultiplier", 2)
1191+
net := makePBFTNetwork(validatorCount, config)
1192+
defer net.stop()
1193+
1194+
twoOffline := false
1195+
threeOffline := true
1196+
net.filterFn = func(src int, dst int, msg []byte) []byte {
1197+
if twoOffline && dst == 2 { // 2 is 'offline'
1198+
return nil
1199+
}
1200+
if threeOffline && dst == 3 { // 3 is 'offline'
1201+
return nil
1202+
}
1203+
return msg
1204+
}
1205+
1206+
for i := int64(1); i <= 8; i++ {
1207+
net.pbftEndpoints[0].manager.Queue() <- createPbftReqBatch(i, uint64(generateBroadcaster(validatorCount)))
1208+
}
1209+
net.process() // vp0,1,2 should have a stable checkpoint for seqNo 8
1210+
net.process() // this second time is necessary for garbage collection it seams
1211+
1212+
// Now vp0,1,2 should be in sync with 8 executions in view 0, and vp4 should be offline
1213+
for i, pep := range net.pbftEndpoints {
1214+
1215+
if i == 3 {
1216+
// 3 is offline for this test
1217+
continue
1218+
}
1219+
1220+
if pep.pbft.view != 0 {
1221+
t.Errorf("Expected replica %d to be in view 1, got %d", pep.id, pep.pbft.view)
1222+
}
1223+
1224+
expectedExecutions := uint64(8)
1225+
if pep.sc.executions != expectedExecutions {
1226+
t.Errorf("Expected %d executions on replica %d, got %d", expectedExecutions, pep.id, pep.sc.executions)
1227+
}
1228+
}
1229+
1230+
// Create new pbft instances to restore from persistence
1231+
for id := 0; id < 3; id++ {
1232+
pe := net.pbftEndpoints[id]
1233+
config := loadConfig()
1234+
config.Set("general.K", "2")
1235+
pe.pbft.close()
1236+
pe.pbft = newPbftCore(uint64(id), config, pe.sc, events.NewTimerFactoryImpl(pe.manager))
1237+
pe.manager.SetReceiver(pe.pbft)
1238+
pe.pbft.N = 4
1239+
pe.pbft.f = (4 - 1) / 3
1240+
pe.pbft.requestTimeout = 200 * time.Millisecond
1241+
1242+
expected := uint64(8)
1243+
if pe.pbft.h != expected {
1244+
t.Errorf("Low watermark should have been %d, got %d", expected, pe.pbft.h)
1245+
}
1246+
}
1247+
1248+
}
11821249
func TestReplicaPersistQSet(t *testing.T) {
11831250
persist := make(map[string][]byte)
11841251

consensus/pbft/pbft-persist.go

+8-8
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,11 @@ func (instance *pbftCore) restoreState() {
148148
logger.Warningf("Replica %d could not restore reqBatchStore: %s", instance.id, err)
149149
}
150150

151+
instance.restoreLastSeqNo()
152+
151153
chkpts, err := instance.consumer.ReadStateSet("chkpt.")
152154
if err == nil {
153-
highSeq := uint64(0)
155+
lowWatermark := instance.lastExec // This is safe because we will round down in moveWatermarks
154156
for key, id := range chkpts {
155157
var seqNo uint64
156158
if _, err = fmt.Sscanf(key, "chkpt.%d", &seqNo); err != nil {
@@ -159,20 +161,18 @@ func (instance *pbftCore) restoreState() {
159161
idAsString := base64.StdEncoding.EncodeToString(id)
160162
logger.Debugf("Replica %d found checkpoint %s for seqNo %d", instance.id, idAsString, seqNo)
161163
instance.chkpts[seqNo] = idAsString
162-
if seqNo > highSeq {
163-
highSeq = seqNo
164+
if seqNo < lowWatermark {
165+
lowWatermark = seqNo
164166
}
165167
}
166168
}
167-
instance.moveWatermarks(highSeq)
169+
instance.moveWatermarks(lowWatermark)
168170
} else {
169171
logger.Warningf("Replica %d could not restore checkpoints: %s", instance.id, err)
170172
}
171173

172-
instance.restoreLastSeqNo()
173-
174-
logger.Infof("Replica %d restored state: view: %d, seqNo: %d, pset: %d, qset: %d, reqBatches: %d, chkpts: %d",
175-
instance.id, instance.view, instance.seqNo, len(instance.pset), len(instance.qset), len(instance.reqBatchStore), len(instance.chkpts))
174+
logger.Infof("Replica %d restored state: view: %d, seqNo: %d, pset: %d, qset: %d, reqBatches: %d, chkpts: %d h: %d",
175+
instance.id, instance.view, instance.seqNo, len(instance.pset), len(instance.qset), len(instance.reqBatchStore), len(instance.chkpts), instance.h)
176176
}
177177

178178
func (instance *pbftCore) restoreLastSeqNo() {

0 commit comments

Comments
 (0)