Skip to content

Commit 5dd2e33

Browse files
committed
[FAB-4408] Add retry logic to Chain.Start steps
The following steps take place during the Start call on a Chain object: 1. Creation of Kafka producer. 2. Posting of no-op CONNECT message to partition that corresponds to channel. 3. Creation of Kafka consumer. All of these steps need to succeed before we can have an OSN that can write to and read from a channel. This changeset: 1. Introduces a `retryProcess` concept, where a given function is executed until it succeeds, following a short/long retry logic. 2. Turns each of the steps above into a `retryProcess`. 3. Moves all of the logic under the `Start` method into a goroutine so as to minimize blocking; the expectation of the `multichain.Manager` is that `Chain.Start` returns quickly. 4. Modifies `Enqueue` so that it returns false (which results in a SERVICE_UNAVAILABLE response to the Broadcast call) when all retriable steps above haven't completed successfully. 5. Makes the orderer panic if LongRetryTotal elapses. This changeset also introduces some unit test changes, beyoned the ones warranted by the production path changes described above, the main one being the closing of some client objects that were leaking. This changeset is part of fixing FAB-4136. Change-Id: I2198e020affa56a877ba61a928aae6a45707524d Signed-off-by: Kostas Christidis <[email protected]>
1 parent cd44fba commit 5dd2e33

File tree

7 files changed

+513
-265
lines changed

7 files changed

+513
-265
lines changed

bddtests/features/bootstrap.feature

+9-7
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,10 @@ Feature: Bootstrap
156156

157157
And the user "dev0Org0" using cert alias "consortium1-cert" broadcasts ConfigUpdate Tx "configUpdateTx1" to orderer "<orderer0>" to create channel "com.acme.blockchain.jdoe.Channel1"
158158

159-
# Sleep as the local orderer ledger needs to create the block that corresponds to the start number of the seek request
160-
And I wait "<BroadcastWaitTime>" seconds
159+
# Sleep as the local orderer needs to bring up the resources that correspond to the new channel
160+
# For the Kafka orderer, this includes setting up a producer and consumer for the channel's partition
161+
# Requesting a deliver earlier may result in a SERVICE_UNAVAILABLE response and a connection drop
162+
And I wait "<ChannelJoinDelay>" seconds
161163

162164
When user "dev0Org0" using cert alias "consortium1-cert" connects to deliver function on orderer "<orderer0>"
163165
And user "dev0Org0" sends deliver a seek request on orderer "<orderer0>" with properties:
@@ -342,8 +344,8 @@ Feature: Bootstrap
342344
# TODO: Once events are working, consider listen event listener as well.
343345

344346
Examples: Orderer Options
345-
| ComposeFile | SystemUpWaitTime | ConsensusType | BroadcastWaitTime | orderer0 | orderer1 | orderer2 |Orderer Specific Info|
346-
| dc-base.yml | 0 | solo | 2 | orderer0 | orderer0 | orderer0 | |
347-
# | dc-base.yml dc-peer-couchdb.yml | 10 | solo | 2 | orderer0 | orderer0 | orderer0 | |
348-
# | dc-base.yml dc-orderer-kafka.yml | 30 | kafka | 7 | orderer0 | orderer1 | orderer2 | |
349-
# | dc-base.yml dc-peer-couchdb.yml dc-orderer-kafka.yml | 30 | kafka | 7 | orderer0 | orderer1 | orderer2 | |
347+
| ComposeFile | SystemUpWaitTime | ConsensusType | ChannelJoinDelay | BroadcastWaitTime | orderer0 | orderer1 | orderer2 |Orderer Specific Info|
348+
| dc-base.yml | 0 | solo | 2 | 2 | orderer0 | orderer0 | orderer0 | |
349+
# | dc-base.yml dc-peer-couchdb.yml | 10 | solo | 2 | 2 | orderer0 | orderer0 | orderer0 | |
350+
# | dc-base.yml dc-orderer-kafka.yml | 40 | kafka | 10 | 5 | orderer0 | orderer1 | orderer2 | |
351+
# | dc-base.yml dc-peer-couchdb.yml dc-orderer-kafka.yml | 40 | kafka | 10 | 5 | orderer0 | orderer1 | orderer2 | |

orderer/common/broadcast/broadcast.go

-1
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,6 @@ func (bh *handlerImpl) Handle(srv ab.AtomicBroadcast_BroadcastServer) error {
146146
}
147147

148148
if !support.Enqueue(msg) {
149-
logger.Infof("Consenter instructed us to shut down")
150149
return srv.Send(&ab.BroadcastResponse{Status: cb.Status_SERVICE_UNAVAILABLE})
151150
}
152151

orderer/kafka/chain.go

+92-71
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ func newChain(consenter commonConsenter, support multichain.ConsenterSupport, la
4545
channel: newChannel(support.ChainID(), defaultPartition),
4646
lastOffsetPersisted: lastOffsetPersisted,
4747
lastCutBlockNumber: lastCutBlockNumber,
48-
halted: false, // Redundant as the default value for booleans is false but added for readability
48+
started: false, // Redundant as the default value for booleans is false but added for readability
49+
startChan: make(chan struct{}),
50+
halted: false,
4951
exitChan: make(chan struct{}),
5052
}, nil
5153
}
@@ -62,10 +64,12 @@ type chainImpl struct {
6264
parentConsumer sarama.Consumer
6365
channelConsumer sarama.PartitionConsumer
6466

65-
halted bool // For the Enqueue() calls
66-
exitChan chan struct{} // For the Chain's Halt() method
67+
// Set the flag to true and close the channel when the retriable steps in `Start` have completed successfully
68+
started bool
69+
startChan chan struct{}
6770

68-
startCompleted bool // For testing
71+
halted bool
72+
exitChan chan struct{}
6973
}
7074

7175
// Errored currently only closes on halt
@@ -76,47 +80,51 @@ func (chain *chainImpl) Errored() <-chan struct{} {
7680
// Start allocates the necessary resources for staying up to date with this
7781
// Chain. Implements the multichain.Chain interface. Called by
7882
// multichain.NewManagerImpl() which is invoked when the ordering process is
79-
// launched, before the call to NewServer().
83+
// launched, before the call to NewServer(). Launches a goroutine so as not to
84+
// block the multichain.Manager.
8085
func (chain *chainImpl) Start() {
86+
go startThread(chain)
87+
}
88+
89+
// Called by Start().
90+
func startThread(chain *chainImpl) {
8191
var err error
8292

8393
// Set up the producer
84-
chain.producer, err = setupProducerForChannel(chain.support.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel, chain.consenter.retryOptions())
94+
chain.producer, err = setupProducerForChannel(chain.consenter.retryOptions(), chain.exitChan, chain.support.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel)
8595
if err != nil {
86-
logger.Criticalf("[channel: %s] Cannot set up producer = %s", chain.channel.topic(), err)
87-
close(chain.exitChan)
88-
chain.halted = true
89-
return
96+
logger.Panicf("[channel: %s] Cannot set up producer = %s", chain.channel.topic(), err)
9097
}
9198
logger.Infof("[channel: %s] Producer set up successfully", chain.support.ChainID())
9299

93100
// Have the producer post the CONNECT message
94-
if err = sendConnectMessage(chain.producer, chain.channel); err != nil {
95-
logger.Criticalf("[channel: %s] Cannot post CONNECT message = %s", chain.channel.topic(), err)
96-
close(chain.exitChan)
97-
chain.halted = true
98-
chain.producer.Close()
99-
return
101+
if err = sendConnectMessage(chain.consenter.retryOptions(), chain.exitChan, chain.producer, chain.channel); err != nil {
102+
logger.Panicf("[channel: %s] Cannot post CONNECT message = %s", chain.channel.topic(), err)
100103
}
101104
logger.Infof("[channel: %s] CONNECT message posted successfully", chain.channel.topic())
102105

103-
// Set up the consumer
104-
chain.parentConsumer, chain.channelConsumer, err = setupConsumerForChannel(chain.support.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel, chain.lastOffsetPersisted+1)
106+
// Set up the parent consumer
107+
chain.parentConsumer, err = setupParentConsumerForChannel(chain.consenter.retryOptions(), chain.exitChan, chain.support.SharedConfig().KafkaBrokers(), chain.consenter.brokerConfig(), chain.channel)
105108
if err != nil {
106-
logger.Criticalf("[channel: %s] Cannot set up consumer = %s", chain.channel.topic(), err)
107-
close(chain.exitChan)
108-
chain.halted = true
109-
chain.producer.Close()
110-
return
109+
logger.Panicf("[channel: %s] Cannot set up parent consumer = %s", chain.channel.topic(), err)
110+
}
111+
logger.Infof("[channel: %s] Parent consumer set up successfully", chain.channel.topic())
112+
113+
// Set up the channel consumer
114+
chain.channelConsumer, err = setupChannelConsumerForChannel(chain.consenter.retryOptions(), chain.exitChan, chain.parentConsumer, chain.channel, chain.lastOffsetPersisted+1)
115+
if err != nil {
116+
logger.Panicf("[channel: %s] Cannot set up channel consumer = %s", chain.channel.topic(), err)
111117
}
112-
logger.Infof("[channel: %s] Consumer set up successfully", chain.channel.topic())
118+
logger.Infof("[channel: %s] Channel consumer set up successfully", chain.channel.topic())
119+
120+
chain.started = true
121+
close(chain.startChan)
122+
113123
go listenForErrors(chain.channelConsumer.Errors(), chain.exitChan)
114124

115125
// Keep up to date with the channel
116-
go processMessagesToBlock(chain.support, chain.producer, chain.parentConsumer, chain.channelConsumer,
126+
processMessagesToBlock(chain.support, chain.producer, chain.parentConsumer, chain.channelConsumer,
117127
chain.channel, &chain.lastCutBlockNumber, &chain.halted, &chain.exitChan)
118-
119-
chain.startCompleted = true
120128
}
121129

122130
// Halt frees the resources which were allocated for this Chain. Implements the
@@ -130,15 +138,21 @@ func (chain *chainImpl) Halt() {
130138
logger.Warningf("[channel: %s] Halting of chain requested again", chain.support.ChainID())
131139
default:
132140
logger.Criticalf("[channel: %s] Halting of chain requested", chain.support.ChainID())
141+
chain.halted = true
133142
close(chain.exitChan)
134143
}
135144
}
136145

137146
// Enqueue accepts a message and returns true on acceptance, or false on
138147
// shutdown. Implements the multichain.Chain interface. Called by Broadcast.
139148
func (chain *chainImpl) Enqueue(env *cb.Envelope) bool {
149+
if !chain.started {
150+
logger.Warningf("[channel: %s] Will not enqueue because the chain hasn't completed its initialization yet", chain.support.ChainID())
151+
return false
152+
}
153+
140154
if chain.halted {
141-
logger.Warningf("[channel: %s] Will not enqueue cause the chain has been halted", chain.support.ChainID())
155+
logger.Warningf("[channel: %s] Will not enqueue because the chain has been halted", chain.support.ChainID())
142156
return false
143157
}
144158

@@ -382,15 +396,38 @@ func processTimeToCut(ttcMessage *ab.KafkaMessageTimeToCut, support multichain.C
382396
return nil
383397
}
384398

385-
// Post a CONNECT message to the channel. This prevents the panicking that would
386-
// occur if we were to set up a consumer and seek on a partition that hadn't
387-
// been written to yet.
388-
func sendConnectMessage(producer sarama.SyncProducer, channel channel) error {
389-
logger.Infof("[channel: %s] Posting the CONNECT message...", channel.topic())
399+
// Sets up the partition consumer for a channel using the given retry options.
400+
func setupChannelConsumerForChannel(retryOptions localconfig.Retry, exitChan chan struct{}, parentConsumer sarama.Consumer, channel channel, startFrom int64) (sarama.PartitionConsumer, error) {
401+
var err error
402+
var channelConsumer sarama.PartitionConsumer
403+
404+
logger.Infof("[channel: %s] Setting up the channel consumer for this channel...", channel.topic())
405+
406+
retryMsg := "Connecting to the Kafka cluster"
407+
setupChannelConsumer := newRetryProcess(retryOptions, exitChan, channel, retryMsg, func() error {
408+
channelConsumer, err = parentConsumer.ConsumePartition(channel.topic(), channel.partition(), startFrom)
409+
return err
410+
})
411+
412+
return channelConsumer, setupChannelConsumer.retry()
413+
}
414+
415+
// Post a CONNECT message to the channel using the given retry options. This
416+
// prevents the panicking that would occur if we were to set up a consumer and
417+
// seek on a partition that hadn't been written to yet.
418+
func sendConnectMessage(retryOptions localconfig.Retry, exitChan chan struct{}, producer sarama.SyncProducer, channel channel) error {
419+
logger.Infof("[channel: %s] About to post the CONNECT message...", channel.topic())
420+
390421
payload := utils.MarshalOrPanic(newConnectMessage())
391422
message := newProducerMessage(channel, payload)
392-
_, _, err := producer.SendMessage(message)
393-
return err
423+
424+
retryMsg := "Attempting to post the CONNECT message..."
425+
postConnect := newRetryProcess(retryOptions, exitChan, channel, retryMsg, func() error {
426+
_, _, err := producer.SendMessage(message)
427+
return err
428+
})
429+
430+
return postConnect.retry()
394431
}
395432

396433
func sendTimeToCut(producer sarama.SyncProducer, channel channel, timeToCutBlockNumber uint64, timer *<-chan time.Time) error {
@@ -402,50 +439,34 @@ func sendTimeToCut(producer sarama.SyncProducer, channel channel, timeToCutBlock
402439
return err
403440
}
404441

405-
// Sets up the listener/consumer for a channel.
406-
func setupConsumerForChannel(brokers []string, brokerConfig *sarama.Config, channel channel, startFrom int64) (sarama.Consumer, sarama.PartitionConsumer, error) {
407-
logger.Infof("[channel: %s] Setting up the consumer for this channel...", channel.topic())
442+
// Sets up the parent consumer for a channel using the given retry options.
443+
func setupParentConsumerForChannel(retryOptions localconfig.Retry, exitChan chan struct{}, brokers []string, brokerConfig *sarama.Config, channel channel) (sarama.Consumer, error) {
444+
var err error
445+
var parentConsumer sarama.Consumer
408446

409-
parentConsumer, err := sarama.NewConsumer(brokers, brokerConfig)
410-
if err != nil {
411-
return nil, nil, err
412-
}
413-
logger.Debugf("[channel: %s] Created new parent consumer", channel.topic())
447+
logger.Infof("[channel: %s] Setting up the parent consumer for this channel...", channel.topic())
414448

415-
channelConsumer, err := parentConsumer.ConsumePartition(channel.topic(), channel.partition(), startFrom)
416-
if err != nil {
417-
_ = parentConsumer.Close()
418-
return nil, nil, err
419-
}
420-
logger.Debugf("[channel: %s] Created new channel consumer", channel.topic())
449+
retryMsg := "Connecting to the Kafka cluster"
450+
setupParentConsumer := newRetryProcess(retryOptions, exitChan, channel, retryMsg, func() error {
451+
parentConsumer, err = sarama.NewConsumer(brokers, brokerConfig)
452+
return err
453+
})
421454

422-
return parentConsumer, channelConsumer, nil
455+
return parentConsumer, setupParentConsumer.retry()
423456
}
424457

425-
// Sets up the writer/producer for a channel.
426-
func setupProducerForChannel(brokers []string, brokerConfig *sarama.Config, channel channel, retryOptions localconfig.Retry) (sarama.SyncProducer, error) {
458+
// Sets up the writer/producer for a channel using the given retry options.
459+
func setupProducerForChannel(retryOptions localconfig.Retry, exitChan chan struct{}, brokers []string, brokerConfig *sarama.Config, channel channel) (sarama.SyncProducer, error) {
427460
var err error
428461
var producer sarama.SyncProducer
429462

430-
// This will be revised in: https://jira.hyperledger.org/browse/FAB-4136
431-
repeatTick := time.NewTicker(retryOptions.ShortInterval)
432-
panicTick := time.NewTicker(retryOptions.ShortTotal)
433-
logger.Debugf("[channel: %s] Retrying every %s for a total of %s", channel.topic(), retryOptions.ShortInterval.String(), retryOptions.ShortTotal.String())
434-
defer repeatTick.Stop()
435-
defer panicTick.Stop()
463+
logger.Infof("[channel: %s] Setting up the producer for this channel...", channel.topic())
436464

437-
loop:
438-
for {
439-
select {
440-
case <-panicTick.C:
441-
return nil, err
442-
case <-repeatTick.C:
443-
logger.Debugf("[channel: %s] Connecting to Kafka cluster: %s", channel.topic(), brokers)
444-
if producer, err = sarama.NewSyncProducer(brokers, brokerConfig); err == nil {
445-
break loop
446-
}
447-
}
448-
}
465+
retryMsg := "Connecting to the Kafka cluster"
466+
setupProducer := newRetryProcess(retryOptions, exitChan, channel, retryMsg, func() error {
467+
producer, err = sarama.NewSyncProducer(brokers, brokerConfig)
468+
return err
469+
})
449470

450-
return producer, err
471+
return producer, setupProducer.retry()
451472
}

0 commit comments

Comments
 (0)