Skip to content

Commit 62d866d

Browse files
author
Jason Yellick
committed
Add flow control to SYNC_STATE_SNAPSHOT
This patch changes the peer snapshot retrieval logic from a non-blocking channel write to a blocking channel write with a timeout. This allows gRPC's own buffering mechanisms to apply back pressure when sending large state snapshots. Additionally, this change will suppress some of the spammy log messages which can flood logs when the correlation id gets out of sync. https://jira.hyperledger.org/browse/FAB-380 Change-Id: Icc3d37f2d161f6ac0bae984ca43e2286a45fbb3d Signed-off-by: Jason Yellick <[email protected]>
1 parent 7b2e488 commit 62d866d

File tree

5 files changed

+50
-11
lines changed

5 files changed

+50
-11
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
vp0:
2+
environment:
3+
# The combination of the following two environment variables ensures that a state snapshot will be pulled
4+
# and that the state snapshot buffer will be exhausted
5+
- CORE_PEER_SYNC_STATE_SNAPSHOT_CHANNELSIZE=0
6+
- CORE_STATETRANSFER_MAXDELTAS=1
7+
vp1:
8+
environment:
9+
- CORE_PEER_SYNC_STATE_SNAPSHOT_CHANNELSIZE=0
10+
- CORE_STATETRANSFER_MAXDELTAS=1
11+
vp2:
12+
environment:
13+
- CORE_PEER_SYNC_STATE_SNAPSHOT_CHANNELSIZE=0
14+
- CORE_STATETRANSFER_MAXDELTAS=1
15+
vp3:
16+
environment:
17+
- CORE_PEER_SYNC_STATE_SNAPSHOT_CHANNELSIZE=0
18+
- CORE_STATETRANSFER_MAXDELTAS=1

bddtests/peer_basic.feature

+6-3
Original file line numberDiff line numberDiff line change
@@ -505,9 +505,11 @@ Feature: Network of Peers
505505
| docker-compose-4-consensus-batch.yml | 60 |
506506

507507

508+
#@doNotDecompose
509+
#@wip
508510
@issue_680
511+
@fab380
509512
Scenario Outline: chaincode example02 with 4 peers and 1 membersrvc, issue #680 (State transfer)
510-
511513
Given we compose "<ComposeFile>"
512514
And I register with CA supplying username "binhn" and secret "7avZQLwcUe9q" on peers:
513515
| vp0 |
@@ -579,8 +581,9 @@ Feature: Network of Peers
579581

580582

581583
Examples: Consensus Options
582-
| ComposeFile | WaitTime |
583-
| docker-compose-4-consensus-batch.yml | 60 |
584+
| ComposeFile | WaitTime |
585+
| docker-compose-4-consensus-batch.yml | 60 |
586+
| docker-compose-4-consensus-batch.yml docker-compose-4-consensus-batch-nosnapshotbuffer.yml | 60 |
584587

585588

586589
@issue_724

core/peer/handler.go

+19-5
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ import (
2929
pb "github.com/hyperledger/fabric/protos"
3030
)
3131

32+
const DefaultSyncSnapshotTimeout time.Duration = 60 * time.Second
33+
3234
// Handler peer handler implementation.
3335
type Handler struct {
3436
chatMutex sync.Mutex
@@ -43,6 +45,8 @@ type Handler struct {
4345
snapshotRequestHandler *syncStateSnapshotRequestHandler
4446
syncStateDeltasRequestHandler *syncStateDeltasHandler
4547
syncBlocksRequestHandler *syncBlocksRequestHandler
48+
syncSnapshotTimeout time.Duration
49+
lastIgnoredSnapshotCID *uint64
4650
}
4751

4852
// NewPeerHandler returns a new Peer handler
@@ -56,6 +60,12 @@ func NewPeerHandler(coord MessageHandlerCoordinator, stream ChatStream, initiate
5660
}
5761
d.doneChan = make(chan struct{})
5862

63+
if dur := viper.GetDuration("peer.sync.state.snapshot.writeTimeout"); dur == 0 {
64+
d.syncSnapshotTimeout = DefaultSyncSnapshotTimeout
65+
} else {
66+
d.syncSnapshotTimeout = dur
67+
}
68+
5969
d.snapshotRequestHandler = newSyncStateSnapshotRequestHandler()
6070
d.syncStateDeltasRequestHandler = newSyncStateDeltasHandler()
6171
d.syncBlocksRequestHandler = newSyncBlocksRequestHandler()
@@ -494,22 +504,26 @@ func (d *Handler) beforeSyncStateSnapshot(e *fsm.Event) {
494504
peerLogger.Errorf("Error sending syncStateSnapshot to channel: %v", x)
495505
}
496506
}()
497-
// Use non-blocking send, will WARN and close channel if missed message.
507+
// Use blocking send and timeout, will WARN and close channel if write times out
498508
d.snapshotRequestHandler.Lock()
499509
defer d.snapshotRequestHandler.Unlock()
510+
timer := time.NewTimer(d.syncSnapshotTimeout)
500511
// Make sure the correlationID matches
501512
if d.snapshotRequestHandler.shouldHandle(syncStateSnapshot.Request.CorrelationId) {
502513
select {
503514
case d.snapshotRequestHandler.channel <- syncStateSnapshot:
504-
default:
515+
case <-timer.C:
505516
// Was not able to write to the channel, in which case the Snapshot stream is incomplete, and must be discarded, closing the channel
506517
// without sending the terminating message which would have had an empty byte slice.
507-
peerLogger.Warningf("Did NOT send SyncStateSnapshot message to channel for correlationId = %d, sequence = %d, closing channel as the message has been discarded", syncStateSnapshot.Request.CorrelationId, syncStateSnapshot.Sequence)
518+
peerLogger.Warningf("Did NOT send SyncStateSnapshot message to channel for correlationId = %d, sequence = %d because we timed out reading, closing channel as the message has been discarded", syncStateSnapshot.Request.CorrelationId, syncStateSnapshot.Sequence)
508519
d.snapshotRequestHandler.reset()
509520
}
510521
} else {
511-
//Ignore the message, does not match the current correlationId
512-
peerLogger.Warningf("Ignoring SyncStateSnapshot message with correlationId = %d, sequence = %d, as current correlationId = %d", syncStateSnapshot.Request.CorrelationId, syncStateSnapshot.Sequence, d.snapshotRequestHandler.correlationID)
522+
if d.lastIgnoredSnapshotCID == nil || *d.lastIgnoredSnapshotCID < syncStateSnapshot.Request.CorrelationId {
523+
peerLogger.Warningf("Ignoring SyncStateSnapshot message with correlationId = %d, sequence = %d, as current correlationId = %d, future messages for this (and older ids) will be suppressed", syncStateSnapshot.Request.CorrelationId, syncStateSnapshot.Sequence, d.snapshotRequestHandler.correlationID)
524+
d.lastIgnoredSnapshotCID = &syncStateSnapshot.Request.CorrelationId
525+
//Ignore the message, does not match the current correlationId
526+
}
513527
}
514528
}
515529

core/peer/statetransfer/statetransfer.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ func (sts *coordinatorImpl) syncBlocks(highBlock, lowBlock uint64, highHash []by
410410

411411
func (sts *coordinatorImpl) syncBlockchainToTarget(blockSyncReq *blockSyncReq) {
412412

413-
logger.Debugf("Processing a blockSyncReq to block %d", blockSyncReq.blockNumber)
413+
logger.Debugf("Processing a blockSyncReq to block %d through %d", blockSyncReq.blockNumber, blockSyncReq.reportOnBlock)
414414

415415
blockchainSize := sts.stack.GetBlockchainSize()
416416

peer/core.yaml

+6-2
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,13 @@ peer:
111111
snapshot:
112112
# Channel size for readonly syncStateSnapshot messages channel
113113
# for receiving state deltas for snapshot from opposite Peer Endpoints.
114-
# NOTE: currently messages are not stored and forwarded, but
115-
# rather lost if the channel write blocks.
114+
# NOTE: when the channel is exhausted, the writes block for up to the
115+
# writeTimeout specified below
116116
channelSize: 50
117+
# Write timeout for the syncStateSnapshot messages
118+
# When the channel above is exhausted, a write blocks for up to this
119+
# amount of time before the message is discarded
120+
writeTimeout: 60s
117121
deltas:
118122
# Channel size for readonly syncStateDeltas messages channel for
119123
# receiving state deltas for a syncBlockRange from opposite

0 commit comments

Comments
 (0)