diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 44db8d8a77..4f51bc273f 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -2,6 +2,11 @@ Release Notes ############# +6.3.18 +====== +* The multi-version client API would not propagate errors that occurred when creating databases on external clients. This could result in a invalid memory accesses. `(PR #5221) `_ +* Fixed a race between the multi-version client connecting to a cluster and destroying the database that could cause an assertion failure. `(PR #5221) `_ + 6.3.15 ====== diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index ddb9e11ed1..cfc0730e90 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -60,6 +60,8 @@ Fixes * Added a new pre-backup action when creating a backup. Backups can now either verify the range data is being saved to is empty before the backup begins (current behavior) or clear the range where data is being saved to. Fixes a ``restore_destination_not_empty`` failure after a backup retry due to ``commit_unknown_failure``. `(PR #4595) `_ * When configured with ``usable_regions=2``, a cluster would not fail over to a region which contained only storage class processes. `(PR #4599) `_ * If a restore is done using a prefix to remove and specific key ranges to restore, the key range boundaries must begin with the prefix to remove. `(PR #4684) `_ +* The multi-version client API would not propagate errors that occurred when creating databases on external clients. This could result in a invalid memory accesses. `(PR #5220) `_ +* Fixed a race between the multi-version client connecting to a cluster and destroying the database that could cause an assertion failure. `(PR #5220) `_ Status ------ diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index afc92bfec0..eedacf80aa 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -591,7 +591,7 @@ Reference DLApi::createDatabase609(const char* clusterFilePath) { Reference DLApi::createDatabase(const char* clusterFilePath) { if (headerVersion >= 610) { FdbCApi::FDBDatabase* db; - api->createDatabase(clusterFilePath, &db); + throwIfError(api->createDatabase(clusterFilePath, &db)); return Reference(new DLDatabase(api, db)); } else { return DLApi::createDatabase609(clusterFilePath); @@ -895,22 +895,43 @@ MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api, api->runOnExternalClients(threadIdx, [this](Reference client) { dbState->addClient(client); }); - if (!externalClientsInitialized.test_and_set()) { - api->runOnExternalClientsAllThreads([&clusterFilePath](Reference client) { - // This creates a database to initialize some client state on the external library - // We only do this on 6.2+ clients to avoid some bugs associated with older versions - // This deletes the new database immediately to discard its connections - if (client->protocolVersion.hasCloseUnusedConnection()) { + api->runOnExternalClientsAllThreads([&clusterFilePath](Reference client) { + // This creates a database to initialize some client state on the external library. + // We only do this on 6.2+ clients to avoid some bugs associated with older versions. + // This deletes the new database immediately to discard its connections. + // + // Simultaneous attempts to create a database could result in us running this initialization + // code in multiple threads simultaneously. It is necessary that each attempt have a chance + // to run this initialization in case the other fails, and it's safe to run them in parallel. + if (client->protocolVersion.hasCloseUnusedConnection() && !client->initialized) { + try { Reference newDb = client->api->createDatabase(clusterFilePath.c_str()); + client->initialized = true; + } catch (Error& e) { + // This connection is not initialized. It is still possible to connect with it, + // but we may not see trace logs from this client until a successful connection + // is established. + TraceEvent(SevWarnAlways, "FailedToInitializeExternalClient") + .detail("LibraryPath", client->libPath) + .detail("ClusterFilePath", clusterFilePath) + .error(e); } - }); - } + } + }); // For clients older than 6.2 we create and maintain our database connection api->runOnExternalClients(threadIdx, [this, &clusterFilePath](Reference client) { if (!client->protocolVersion.hasCloseUnusedConnection()) { - dbState->legacyDatabaseConnections[client->protocolVersion] = - client->api->createDatabase(clusterFilePath.c_str()); + try { + dbState->legacyDatabaseConnections[client->protocolVersion] = + client->api->createDatabase(clusterFilePath.c_str()); + } catch (Error& e) { + // This connection is discarded + TraceEvent(SevWarnAlways, "FailedToCreateLegacyDatabaseConnection") + .detail("LibraryPath", client->libPath) + .detail("ClusterFilePath", clusterFilePath) + .error(e); + } } }); @@ -994,7 +1015,7 @@ ThreadFuture MultiVersionDatabase::getServerProtocol(Optional

versionMonitorDb) : clusterFilePath(clusterFilePath), versionMonitorDb(versionMonitorDb), - dbVar(new ThreadSafeAsyncVar>(Reference(nullptr))) {} + dbVar(new ThreadSafeAsyncVar>(Reference(nullptr))), closed(false) {} // Adds a client (local or externally loaded) that can be used to connect to the cluster void MultiVersionDatabase::DatabaseState::addClient(Reference client) { @@ -1058,6 +1079,10 @@ ThreadFuture MultiVersionDatabase::DatabaseState::monitorProtocolVersion() // Called when a change to the protocol version of the cluster has been detected. // Must be called from the main thread void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion protocolVersion) { + if (closed) { + return; + } + // If the protocol version changed but is still compatible, update our local version but keep the same connection if (dbProtocolVersion.present() && protocolVersion.normalizedVersion() == dbProtocolVersion.get().normalizedVersion()) { @@ -1084,7 +1109,20 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion .detail("Failed", client->failed) .detail("External", client->external); - Reference newDb = client->api->createDatabase(clusterFilePath.c_str()); + Reference newDb; + try { + newDb = client->api->createDatabase(clusterFilePath.c_str()); + } catch (Error& e) { + TraceEvent(SevWarnAlways, "MultiVersionClientFailedToCreateDatabase") + .detail("LibraryPath", client->libPath) + .detail("External", client->external) + .detail("ClusterFilePath", clusterFilePath) + .error(e); + + // Put the client in a disconnected state until the version changes again + updateDatabase(Reference(), Reference()); + return; + } if (client->external && !MultiVersionApi::apiVersionAtLeast(610)) { // Old API versions return a future when creating the database, so we need to wait for it @@ -1112,6 +1150,10 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion // Replaces the active database connection with a new one. Must be called from the main thread. void MultiVersionDatabase::DatabaseState::updateDatabase(Reference newDb, Reference client) { + if (closed) { + return; + } + if (newDb) { optionLock.enter(); for (auto option : options) { @@ -1143,12 +1185,28 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference ne versionMonitorDb = db; } else { // For older clients that don't have an API to get the protocol version, we have to monitor it locally - versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + try { + versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + } catch (Error& e) { + // We can't create a new database to monitor the cluster version. This means we will continue using the + // previous one, which should hopefully continue to work. + TraceEvent(SevWarnAlways, "FailedToCreateDatabaseForVersionMonitoring") + .detail("ClusterFilePath", clusterFilePath) + .error(e); + } } } else { // We don't have a database connection, so use the local client to monitor the protocol version db = Reference(); - versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + try { + versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + } catch (Error& e) { + // We can't create a new database to monitor the cluster version. This means we will continue using the + // previous one, which should hopefully continue to work. + TraceEvent(SevWarnAlways, "FailedToCreateDatabaseForVersionMonitoring") + .detail("ClusterFilePath", clusterFilePath) + .error(e); + } } dbVar->set(db); @@ -1178,6 +1236,7 @@ void MultiVersionDatabase::DatabaseState::close() { Reference self = Reference::addRef(this); onMainThreadVoid( [self]() { + self->closed = true; if (self->protocolVersionMonitor.isValid()) { self->protocolVersionMonitor.cancel(); } @@ -1255,8 +1314,6 @@ void MultiVersionDatabase::LegacyVersionMonitor::close() { } } -std::atomic_flag MultiVersionDatabase::externalClientsInitialized = ATOMIC_FLAG_INIT; - // MultiVersionApi bool MultiVersionApi::apiVersionAtLeast(int minVersion) { ASSERT_NE(MultiVersionApi::api->apiVersion, 0); diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 65892a8dbf..274df7dd84 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -417,12 +417,15 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { ProtocolVersion protocolVersion; IClientApi* api; bool failed; + std::atomic_bool initialized; std::vector> threadCompletionHooks; - ClientInfo() : ClientDesc(std::string(), false), protocolVersion(0), api(nullptr), failed(true) {} - ClientInfo(IClientApi* api) : ClientDesc("internal", false), protocolVersion(0), api(api), failed(false) {} + ClientInfo() + : ClientDesc(std::string(), false), protocolVersion(0), api(nullptr), failed(true), initialized(false) {} + ClientInfo(IClientApi* api) + : ClientDesc("internal", false), protocolVersion(0), api(api), failed(false), initialized(false) {} ClientInfo(IClientApi* api, std::string libPath) - : ClientDesc(libPath, true), protocolVersion(0), api(api), failed(false) {} + : ClientDesc(libPath, true), protocolVersion(0), api(api), failed(false), initialized(false) {} void loadProtocolVersion(); bool canReplace(Reference other) const; @@ -504,10 +507,9 @@ public: // this will be a specially created local db. Reference versionMonitorDb; + bool closed; + ThreadFuture changed; - - bool cancelled; - ThreadFuture dbReady; ThreadFuture protocolVersionMonitor; @@ -557,10 +559,6 @@ public: const Reference dbState; friend class MultiVersionTransaction; - - // Clients must create a database object in order to initialize some of their state. - // This needs to be done only once, and this flag tracks whether that has happened. - static std::atomic_flag externalClientsInitialized; }; // An implementation of IClientApi that can choose between multiple different client implementations either provided