Skip to content

Commit 37948e5

Browse files
committed
Fix consumer might not subscribe after a reconnection
Fixes #436 ### Motivation When a consumer starts grabbing the connection, it registers a timer after the operation timeout. When that timer is expired, it will fail the connection and cancel the connection timer. However, it results a race condition that: 1. The consumer's connection is closed (e.g. the keep alive timer failed) 2. The connection timer is registered on the executor and will trigger the reconnection after 100ms 3. The connection timer is cancelled, then the reconnection won't start. ### Modifications Cancel the `creationTimer_` once `HandlerBase#start` succeeded first time. Add `testReconnectWhenFirstConnectTimedOut` to cover this case.
1 parent 35bf161 commit 37948e5

File tree

5 files changed

+75
-6
lines changed

5 files changed

+75
-6
lines changed

lib/ClientConnection.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,7 @@ class PULSAR_PUBLIC ClientConnection : public std::enable_shared_from_this<Clien
404404
const size_t poolIndex_;
405405

406406
friend class PulsarFriend;
407+
friend class ConsumerTest;
407408

408409
void checkServerError(ServerError error, const std::string& message);
409410

lib/HandlerBase.cc

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
*/
1919
#include "HandlerBase.h"
2020

21+
#include <chrono>
22+
23+
#include "AsioDefines.h"
2124
#include "Backoff.h"
2225
#include "ClientConnection.h"
2326
#include "ClientImpl.h"
@@ -63,6 +66,7 @@ void HandlerBase::start() {
6366
creationTimer_->async_wait([this, weakSelf](const ASIO_ERROR& error) {
6467
auto self = weakSelf.lock();
6568
if (self && !error) {
69+
LOG_WARN("Cancel the pending reconnection due to the start timeout");
6670
connectionFailed(ResultTimeout);
6771
ASIO_ERROR ignored;
6872
timer_->cancel(ignored);
@@ -118,13 +122,21 @@ void HandlerBase::grabCnx(const boost::optional<std::string>& assignedBrokerUrl)
118122
}
119123
auto self = shared_from_this();
120124
auto cnxFuture = getConnection(client, assignedBrokerUrl);
121-
cnxFuture.addListener([this, self](Result result, const ClientConnectionPtr& cnx) {
125+
using namespace std::chrono;
126+
auto before = high_resolution_clock::now();
127+
cnxFuture.addListener([this, self, before](Result result, const ClientConnectionPtr& cnx) {
122128
if (result == ResultOk) {
123-
LOG_DEBUG(getName() << "Connected to broker: " << cnx->cnxString());
124-
connectionOpened(cnx).addListener([this, self](Result result, bool) {
129+
connectionOpened(cnx).addListener([this, self, before](Result result, bool) {
125130
// Do not use bool, only Result.
126131
reconnectionPending_ = false;
127-
if (result != ResultOk && isResultRetryable(result)) {
132+
if (result == ResultOk) {
133+
connectionTimeMs_ =
134+
duration_cast<milliseconds>(high_resolution_clock::now() - before).count();
135+
// Prevent the creationTimer_ from cancelling the timer_ in future
136+
ASIO_ERROR ignored;
137+
creationTimer_->cancel(ignored);
138+
LOG_INFO("Finished connecting to broker after " << connectionTimeMs_ << " ms")
139+
} else if (isResultRetryable(result)) {
128140
scheduleReconnection();
129141
}
130142
});
@@ -194,8 +206,7 @@ void HandlerBase::scheduleReconnection(const boost::optional<std::string>& assig
194206

195207
void HandlerBase::handleTimeout(const ASIO_ERROR& ec, const boost::optional<std::string>& assignedBrokerUrl) {
196208
if (ec) {
197-
LOG_DEBUG(getName() << "Ignoring timer cancelled event, code[" << ec << "]");
198-
return;
209+
LOG_INFO(getName() << "Ignoring timer cancelled event, code[" << ec << "]");
199210
} else {
200211
epoch_++;
201212
grabCnx(assignedBrokerUrl);

lib/HandlerBase.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,11 @@ class HandlerBase : public std::enable_shared_from_this<HandlerBase> {
157157
ClientConnectionWeakPtr connection_;
158158
std::string redirectedClusterURI_;
159159
std::atomic<long> firstRequestIdAfterConnect_{-1L};
160+
std::atomic<long> connectionTimeMs_{0}; // only for tests
160161

161162
friend class ClientConnection;
162163
friend class PulsarFriend;
164+
friend class ConsumerTest;
163165
};
164166
} // namespace pulsar
165167
#endif //_PULSAR_HANDLER_BASE_HEADER_

tests/ConsumerTest.cc

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
* specific language governing permissions and limitations
1717
* under the License.
1818
*/
19+
#include "ConsumerTest.h"
20+
1921
#include <gtest/gtest.h>
2022
#include <pulsar/Client.h>
2123

@@ -26,6 +28,7 @@
2628
#include <map>
2729
#include <mutex>
2830
#include <set>
31+
#include <string>
2932
#include <thread>
3033
#include <vector>
3134

@@ -1467,4 +1470,33 @@ TEST(ConsumerTest, testMultiConsumerListenerAndAck) {
14671470
client.close();
14681471
}
14691472

1473+
// When a consumer starts grabbing the connection, it registers a timer after the operation timeout. When that
1474+
// timer is expired, it will fail the connection and cancel the connection timer. However, it results a race
1475+
// condition that:
1476+
// 1. The consumer's connection is closed (e.g. the keep alive timer failed)
1477+
// 2. The connection timer is registered on the executor and will trigger the reconnection after 100ms
1478+
// 3. The connection timer is cancelled, then the reconnection won't start.
1479+
TEST(ConsumerTest, testReconnectWhenFirstConnectTimedOut) {
1480+
ClientConfiguration conf;
1481+
conf.setOperationTimeoutSeconds(1);
1482+
Client client{lookupUrl, conf};
1483+
1484+
auto topic = "consumer-test-reconnect-when-first-connect-timed-out" + std::to_string(time(nullptr));
1485+
Consumer consumer;
1486+
ASSERT_EQ(ResultOk, client.subscribe(topic, "sub", consumer));
1487+
1488+
auto timer = ConsumerTest::scheduleCloseConnection(consumer, std::chrono::seconds(1));
1489+
ASSERT_TRUE(timer != nullptr);
1490+
timer->wait();
1491+
1492+
Producer producer;
1493+
ASSERT_EQ(ResultOk, client.createProducer(topic, producer));
1494+
ASSERT_EQ(ResultOk, producer.send(MessageBuilder().setContent("msg").build()));
1495+
1496+
Message msg;
1497+
ASSERT_EQ(ResultOk, consumer.receive(msg, 3000));
1498+
ASSERT_EQ("msg", msg.getDataAsString());
1499+
client.close();
1500+
}
1501+
14701502
} // namespace pulsar

tests/ConsumerTest.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,14 @@
1616
* specific language governing permissions and limitations
1717
* under the License.
1818
*/
19+
#include <chrono>
20+
#include <memory>
21+
#include <stdexcept>
1922
#include <string>
2023

24+
#include "lib/ClientConnection.h"
2125
#include "lib/ConsumerImpl.h"
26+
#include "lib/ExecutorService.h"
2227

2328
using std::string;
2429

@@ -28,5 +33,23 @@ class ConsumerTest {
2833
static int getNumOfMessagesInQueue(const Consumer& consumer) {
2934
return consumer.impl_->getNumOfPrefetchedMessages();
3035
}
36+
37+
template <typename T>
38+
static DeadlineTimerPtr scheduleCloseConnection(const Consumer& consumer, T delaySinceStartGrabCnx) {
39+
auto impl = std::dynamic_pointer_cast<ConsumerImpl>(consumer.impl_);
40+
if (!impl) {
41+
throw std::runtime_error("scheduleCloseConnection can only be called on ConsumerImpl");
42+
}
43+
44+
auto cnx = impl->getCnx().lock();
45+
if (!cnx) {
46+
return nullptr;
47+
}
48+
auto timer = cnx->executor_->createDeadlineTimer();
49+
timer->expires_from_now(delaySinceStartGrabCnx -
50+
std::chrono::milliseconds(impl->connectionTimeMs_ + 50));
51+
timer->async_wait([cnx](const ASIO_ERROR&) { cnx->close(); });
52+
return timer;
53+
}
3154
};
3255
} // namespace pulsar

0 commit comments

Comments
 (0)