From 1730d75f7304f67134aed669f5d053eef4c72482 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 21 Sep 2021 16:49:55 -0700 Subject: [PATCH] change configure test add store type check add test file --- .gitignore | 2 +- .../sphinx/source/command-line-interface.rst | 3 +- fdbcli/ConfigureCommand.actor.cpp | 21 +++--- fdbcli/fdbcli.actor.cpp | 1 - fdbclient/ManagementAPI.actor.h | 12 ++-- fdbrpc/simulator.h | 2 + fdbserver/DataDistribution.actor.cpp | 53 +++++++++++---- fdbserver/QuietDatabase.actor.cpp | 12 ++-- fdbserver/SimulatedCluster.actor.cpp | 23 +++++-- .../workloads/ConfigureDatabase.actor.cpp | 68 +++++++++++++++++-- tests/CMakeLists.txt | 1 + tests/slow/ConfigureStorageMigrationTest.toml | 20 ++++++ 12 files changed, 168 insertions(+), 50 deletions(-) create mode 100644 tests/slow/ConfigureStorageMigrationTest.toml diff --git a/.gitignore b/.gitignore index 270c631eed..b486706077 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ bindings/java/foundationdb-tests*.jar bindings/java/fdb-java-*-sources.jar packaging/msi/FDBInstaller.msi build/ -cmake-build-debug/ +cmake-build-debug* # Generated source, build, and packaging files *.g.cpp *.g.h diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst index b96c17be61..7617b5ad15 100644 --- a/documentation/sphinx/source/command-line-interface.rst +++ b/documentation/sphinx/source/command-line-interface.rst @@ -118,7 +118,7 @@ storage migration type ^^^^^^^^^^^^^^^^^^^^^^ Set the storage migration type, or how FDB should migrate to a new storage engine if the value is changed. -The default is ``disabled``, which means changing the storage engine will not be possible. +The default is ``disabled``, which means changing the storage engine will not be possible. 
* ``disabled`` * ``gradual`` @@ -128,7 +128,6 @@ The default is ``disabled``, which means changing the storage engine will not be ``aggressive`` tries to replace as many storages as it can at once, and will recruit a new storage server on the same process as the old one. This will be faster, but can potentially hit degraded performance or OOM with two storages on the same process. The main benefit over ``gradual`` is that this doesn't need to take one storage out of rotation, so it works for small or development clusters that have the same number of storage processes as the replication factor. Note that ``aggressive`` is not exclusive to running the perpetual wiggle. ``disabled`` means that if the storage engine is changed, fdb will not move the cluster over to the new storage engine. This will disable the perpetual wiggle from rewriting storage files. - consistencycheck ---------------- diff --git a/fdbcli/ConfigureCommand.actor.cpp b/fdbcli/ConfigureCommand.actor.cpp index a13efaac16..fc2383f574 100644 --- a/fdbcli/ConfigureCommand.actor.cpp +++ b/fdbcli/ConfigureCommand.actor.cpp @@ -173,6 +173,16 @@ ACTOR Future configureCommandActor(Reference db, fprintf(stderr, "ERROR: These changes would make the configuration invalid\n"); ret = false; break; + case ConfigurationResult::STORAGE_MIGRATION_DISABLED: + fprintf(stderr, + "ERROR: Storage engine type cannot be changed because " + "storage_migration_mode=disabled.\n"); + fprintf(stderr, + "Type `configure perpetual_storage_wiggle=1 storage_migration_type=gradual' to enable gradual " + "migration with the perpetual wiggle, or `configure " + "storage_migration_type=aggressive' for aggressive migration.\n"); + ret = false; + break; case ConfigurationResult::DATABASE_ALREADY_CREATED: fprintf(stderr, "ERROR: Database already exists! 
To change configuration, don't say `new'\n"); ret = false; @@ -240,17 +250,6 @@ ACTOR Future configureCommandActor(Reference db, "storage_migration_type=gradual' to set the gradual migration type.\n"); ret = false; break; - case ConfigurationResult::SUCCESS_WARN_CHANGE_STORAGE_NOMIGRATE: - printf("Configuration changed, with warnings\n"); - fprintf(stderr, - "WARN: Storage engine type changed, but nothing will be migrated because " - "storage_migration_mode=disabled.\n"); - fprintf(stderr, - "Type `configure perpetual_storage_wiggle=1 storage_migration_type=gradual' to enable gradual " - "migration with the perpetual wiggle, or `configure " - "storage_migration_type=aggressive' for aggressive migration.\n"); - ret = false; - break; default: ASSERT(false); ret = false; diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 88418a8855..2cf1946bf0 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -632,7 +632,6 @@ ACTOR Future commitTransaction(Reference tr) { } // FIXME: Factor address parsing from coordinators, include, exclude - ACTOR Future coordinators(Database db, std::vector tokens, bool isClusterTLS) { state StringRef setName; StringRef nameTokenBegin = LiteralStringRef("description="); diff --git a/fdbclient/ManagementAPI.actor.h b/fdbclient/ManagementAPI.actor.h index f722c91217..021dfde2bf 100644 --- a/fdbclient/ManagementAPI.actor.h +++ b/fdbclient/ManagementAPI.actor.h @@ -49,6 +49,7 @@ enum class ConfigurationResult { UNKNOWN_OPTION, INCOMPLETE_CONFIGURATION, INVALID_CONFIGURATION, + STORAGE_MIGRATION_DISABLED, DATABASE_ALREADY_CREATED, DATABASE_CREATED, DATABASE_UNAVAILABLE, @@ -61,7 +62,6 @@ enum class ConfigurationResult { DCID_MISSING, LOCKED_NOT_NEW, SUCCESS_WARN_PPW_GRADUAL, - SUCCESS_WARN_CHANGE_STORAGE_NOMIGRATE, SUCCESS, }; @@ -569,11 +569,9 @@ Future changeConfig(Reference db, std::map 0 && - newConfig.storageMigrationType == StorageMigrationType::DISABLED)) { + return 
ConfigurationResult::STORAGE_MIGRATION_DISABLED; + } else if (newConfig.storageMigrationType == StorageMigrationType::GRADUAL && + newConfig.perpetualStorageWiggleSpeed == 0) { warnPPWGradual = true; } } @@ -636,8 +634,6 @@ Future changeConfig(Reference db, std::mapglobal(id); }; void setGlobal(size_t id, flowGlobalType v) final { getCurrentProcess()->setGlobal(id, v); }; diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index c6d68da20e..81e4fe9a2d 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -2500,6 +2500,7 @@ struct DDTeamCollection : ReferenceCounted { TraceEvent(newServer.isTss() ? "AddedTSS" : "AddedStorageServer", distributorId) .detail("ServerID", newServer.id()) + .detail("ProcessID", newServer.locality.processId()) .detail("ProcessClass", processClass.toString()) .detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token) .detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress()); @@ -3246,7 +3247,11 @@ ACTOR Future removeWrongStoreType(DDTeamCollection* self) { // Server may be removed due to failure while the wrongStoreTypeToRemove is sent to the // storageServerTracker. 
This race may cause the server to be removed before react to // wrongStoreTypeToRemove - server.second->wrongStoreTypeToRemove.set(true); + if (self->configuration.storageMigrationType == StorageMigrationType::AGGRESSIVE) { + // if the Storage Migration type is aggressive, let DD remove SS with wrong storage type + server.second->wrongStoreTypeToRemove.set(true); + } + // Otherwise, wait Perpetual Wiggler to wiggle the SS with wrong storage type foundSSToRemove = true; TraceEvent("WrongStoreTypeRemover", self->distributorId) .detail("Server", server.first) @@ -3962,10 +3967,12 @@ ACTOR Future>> getSe ACTOR Future updateNextWigglingStoragePID(DDTeamCollection* teamCollection) { state ReadYourWritesTransaction tr(teamCollection->cx); state Value writeValue; + state const Key writeKey = + wigglingStorageServerKey.withSuffix(teamCollection->primary ? "/primary"_sr : "/remote"_sr); loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - Optional value = wait(tr.get(wigglingStorageServerKey)); + Optional value = wait(tr.get(writeKey)); if (teamCollection->pid2server_info.empty()) { writeValue = LiteralStringRef(""); } else { @@ -3981,7 +3988,7 @@ ACTOR Future updateNextWigglingStoragePID(DDTeamCollection* teamCollection writeValue = pid; } } - tr.set(wigglingStorageServerKey, writeValue); + tr.set(writeKey, writeValue); wait(tr.commit()); break; } catch (Error& e) { @@ -3989,6 +3996,7 @@ ACTOR Future updateNextWigglingStoragePID(DDTeamCollection* teamCollection } } TraceEvent(SevDebug, "PerpetualNextWigglingStoragePID", teamCollection->distributorId) + .detail("Primary", teamCollection->primary) .detail("WriteValue", writeValue); return Void(); @@ -4010,6 +4018,15 @@ ACTOR Future perpetualStorageWiggleIterator(AsyncVar* stopSignal, // there must not have other teams to place wiggled data takeRest = teamCollection->server_info.size() <= teamCollection->configuration.storageTeamSize || teamCollection->machine_info.size() < 
teamCollection->configuration.storageTeamSize; + teamCollection->doBuildTeams = true; + if (takeRest && + teamCollection->configuration.storageMigrationType == StorageMigrationType::GRADUAL) { + TraceEvent(SevWarn, "PerpetualWiggleSleep", teamCollection->distributorId) + .suppressFor(SERVER_KNOBS->PERPETUAL_WIGGLE_DELAY * 4) + .detail("ServerSize", teamCollection->server_info.size()) + .detail("MachineSize", teamCollection->machine_info.size()) + .detail("StorageTeamSize", teamCollection->configuration.storageTeamSize); + } } wait(updateNextWigglingStoragePID(teamCollection)); } @@ -4028,14 +4045,16 @@ ACTOR Future, Value>> watchPerpetualStoragePIDChange(DDTe state ReadYourWritesTransaction tr(self->cx); state Future watchFuture; state Value ret; + state const Key readKey = wigglingStorageServerKey.withSuffix(self->primary ? "/primary"_sr : "/remote"_sr); + loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - Optional value = wait(tr.get(wigglingStorageServerKey)); + Optional value = wait(tr.get(readKey)); if (value.present()) { ret = value.get(); } - watchFuture = tr.watch(wigglingStorageServerKey); + watchFuture = tr.watch(readKey); wait(tr.commit()); break; } catch (Error& e) { @@ -4096,7 +4115,13 @@ ACTOR Future perpetualStorageWiggler(AsyncVar* stopSignal, TEST(true); // paused because cluster is unhealthy moveFinishFuture = Never(); self->includeStorageServersForWiggle(); - TraceEvent("PerpetualStorageWigglePause", self->distributorId) + self->doBuildTeams = true; + + TraceEvent(self->configuration.storageMigrationType == StorageMigrationType::AGGRESSIVE ? 
SevInfo + : SevWarn, + "PerpetualStorageWigglePause", + self->distributorId) + .detail("Primary", self->primary) .detail("ProcessId", pid) .detail("BestTeamKeepStuckCount", self->bestTeamKeepStuckCount) .detail("ExtraHealthyTeamCount", extraTeamCount) @@ -4108,6 +4133,7 @@ ACTOR Future perpetualStorageWiggler(AsyncVar* stopSignal, movingCount = fv.size(); moveFinishFuture = waitForAll(fv); TraceEvent("PerpetualStorageWiggleStart", self->distributorId) + .detail("Primary", self->primary) .detail("ProcessId", pid) .detail("ExtraHealthyTeamCount", extraTeamCount) .detail("HealthyTeamCount", self->healthyTeamCount) @@ -4134,6 +4160,7 @@ ACTOR Future perpetualStorageWiggler(AsyncVar* stopSignal, moveFinishFuture = Never(); self->includeStorageServersForWiggle(); TraceEvent("PerpetualStorageWiggleFinish", self->distributorId) + .detail("Primary", self->primary) .detail("ProcessId", pid.toString()) .detail("StorageCount", movingCount); @@ -4153,6 +4180,7 @@ ACTOR Future perpetualStorageWiggler(AsyncVar* stopSignal, if (self->wigglingPid.present()) { self->includeStorageServersForWiggle(); TraceEvent("PerpetualStorageWiggleExitingPause", self->distributorId) + .detail("Primary", self->primary) .detail("ProcessId", self->wigglingPid.get()); self->wigglingPid.reset(); } @@ -4190,14 +4218,16 @@ ACTOR Future monitorPerpetualStorageWiggle(DDTeamCollection* teamCollectio &stopWiggleSignal, finishStorageWiggleSignal.getFuture(), teamCollection)); collection.add( perpetualStorageWiggler(&stopWiggleSignal, finishStorageWiggleSignal, teamCollection)); - TraceEvent("PerpetualStorageWiggleOpen", teamCollection->distributorId).log(); + TraceEvent("PerpetualStorageWiggleOpen", teamCollection->distributorId) + .detail("Primary", teamCollection->primary); } else if (speed == 0) { if (!stopWiggleSignal.get()) { stopWiggleSignal.set(true); wait(collection.signalAndReset()); teamCollection->pauseWiggle->set(true); } - TraceEvent("PerpetualStorageWiggleClose", 
teamCollection->distributorId).log(); + TraceEvent("PerpetualStorageWiggleClose", teamCollection->distributorId) + .detail("Primary", teamCollection->primary); } wait(watchFuture); break; @@ -4653,6 +4683,7 @@ ACTOR Future storageServerTracker( if (worstStatus == DDTeamCollection::Status::WIGGLING && !isTss) { status.isWiggling = true; TraceEvent("PerpetualWigglingStorageServer", self->distributorId) + .detail("Primary", self->primary) .detail("Server", server->id) .detail("ProcessId", server->lastKnownInterface.locality.processId()) .detail("Address", worstAddr.toString()); @@ -5598,10 +5629,7 @@ ACTOR Future dataDistributionTeamCollection( self->addActor.send(trackExcludedServers(self)); self->addActor.send(monitorHealthyTeams(self)); self->addActor.send(waitHealthyZoneChange(self)); - - if (self->primary) { // the primary dc also handle the satellite dc's perpetual wiggling - self->addActor.send(monitorPerpetualStorageWiggle(self)); - } + self->addActor.send(monitorPerpetualStorageWiggle(self)); // SOMEDAY: Monitor FF/serverList for (new) servers that aren't in allServers and add or remove them loop choose { @@ -5641,6 +5669,7 @@ ACTOR Future dataDistributionTeamCollection( .detail("StorageTeamSize", self->configuration.storageTeamSize) .detail("HighestPriority", highestPriority) .trackLatest(self->primary ? 
"TotalDataInFlight" : "TotalDataInFlightRemote"); + loggingTrigger = delay(SERVER_KNOBS->DATA_DISTRIBUTION_LOGGING_INTERVAL, TaskPriority::FlushTrace); } when(wait(self->serverTrackerErrorOut.getFuture())) {} // Propagate errors from storageServerTracker diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 1c8b2a3916..97b9211cd1 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -631,22 +631,22 @@ ACTOR Future waitForQuietDatabase(Database cx, state Future storageQueueSize; state Future dataDistributionActive; state Future storageServersRecruiting; - auto traceMessage = "QuietDatabase" + phase + "Begin"; - TraceEvent(traceMessage.c_str()); + TraceEvent(traceMessage.c_str()).log(); // In a simulated environment, wait 5 seconds so that workers can move to their optimal locations if (g_network->isSimulated()) wait(delay(5.0)); + // The quiet database check (which runs at the end of every test) will always time out due to active data movement. // To get around this, quiet Database will disable the perpetual wiggle in the setup phase. 
+ printf("Set perpetual_storage_wiggle=0 ...\n"); wait(setPerpetualStorageWiggle(cx, false, LockAware::True)); printf("Set perpetual_storage_wiggle=0 Done.\n"); // Require 3 consecutive successful quiet database checks spaced 2 second apart state int numSuccesses = 0; - loop { try { TraceEvent("QuietDatabaseWaitingOnDataDistributor").log(); @@ -686,15 +686,15 @@ ACTOR Future waitForQuietDatabase(Database cx, if (dataInFlight.get() > dataInFlightGate || tLogQueueInfo.get().first > maxTLogQueueGate || tLogQueueInfo.get().second > maxPoppedVersionLag || dataDistributionQueueSize.get() > maxDataDistributionQueueSize || - storageQueueSize.get() > maxStorageServerQueueGate || dataDistributionActive.get() == false || - storageServersRecruiting.get() == true || teamCollectionValid.get() == false) { + storageQueueSize.get() > maxStorageServerQueueGate || !dataDistributionActive.get() || + storageServersRecruiting.get() || !teamCollectionValid.get()) { wait(delay(1.0)); numSuccesses = 0; } else { if (++numSuccesses == 3) { auto msg = "QuietDatabase" + phase + "Done"; - TraceEvent(msg.c_str()); + TraceEvent(msg.c_str()).log(); break; } else { wait(delay(g_network->isSimulated() ? 
2.0 : 30.0)); diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 64055a377e..e0b1278131 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -285,6 +285,7 @@ public: int maxTLogVersion = TLogVersion::MAX_SUPPORTED; // Set true to simplify simulation configs for easier debugging bool simpleConfig = false; + int extraMachineCountDC = 0; Optional generateFearless, buggify; Optional datacenters, desiredTLogCount, commitProxyCount, grvProxyCount, resolverCount, storageEngineType, stderrSeverity, machineCount, processesPerMachine, coordinators; @@ -338,7 +339,8 @@ public: .add("machineCount", &machineCount) .add("processesPerMachine", &processesPerMachine) .add("coordinators", &coordinators) - .add("configDB", &configDBType); + .add("configDB", &configDBType) + .add("extraMachineCountDC", &extraMachineCountDC); try { auto file = toml::parse(testFile); if (file.contains("configuration") && toml::find(file, "configuration").is_table()) { @@ -1248,7 +1250,7 @@ void SimulationConfig::setRandomConfig() { set_config("perpetual_storage_wiggle=0"); } else { // TraceEvent("SimulatedConfigRandom").detail("PerpetualWiggle", 1); - set_config("storage_migration_type=gradual perpetual_storage_wiggle=1"); + set_config("perpetual_storage_wiggle=1"); } if (deterministicRandom()->random01() < 0.5) { @@ -1655,6 +1657,7 @@ void SimulationConfig::setMachineCount(const TestConfig& testConfig) { machine_count = std::max(machine_count, deterministicRandom()->randomInt(5, extraDB ? 6 : 10)); } } + machine_count += datacenters * testConfig.extraMachineCountDC; } // Sets the coordinator count based on the testConfig. May be overwritten later @@ -1693,7 +1696,7 @@ void SimulationConfig::setTss(const TestConfig& testConfig) { // reduce tss to half of extra non-seed servers that can be recruited in usable regions. 
tssCount = - std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2)); + std::max(0, std::min(tssCount, db.usableRegions * ((machine_count / datacenters) - db.storageTeamSize) / 2)); if (!testConfig.config.present() && tssCount > 0) { std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); @@ -1980,6 +1983,7 @@ void setupSimulatedSystem(std::vector>* systemActors, bool requiresExtraDBMachines = testConfig.extraDB && g_simulator.extraDB->toString() != conn.toString(); int assignedMachines = 0, nonVersatileMachines = 0; + bool gradualMigrationPossible = true; std::vector processClassesSubSet = { ProcessClass::UnsetClass, ProcessClass::StatelessClass }; for (int dc = 0; dc < dataCenters; dc++) { @@ -1988,6 +1992,7 @@ void setupSimulatedSystem(std::vector>* systemActors, std::vector machineIdentities; int machines = machineCount / dataCenters + (dc < machineCount % dataCenters); // add remainder of machines to first datacenter + int possible_ss = 0; int dcCoordinators = coordinatorCount / dataCenters + (dc < coordinatorCount % dataCenters); printf("Datacenter %d: %d/%d machines, %d/%d coordinators\n", dc, @@ -2028,8 +2033,12 @@ void setupSimulatedSystem(std::vector>* systemActors, processClass = ProcessClass((ProcessClass::ClassType)deterministicRandom()->randomInt(0, 3), ProcessClass::CommandLineSource); // Unset, Storage, or Transaction if (processClass == - ProcessClass::StatelessClass) // *can't* be assigned to other roles, even in an emergency + ProcessClass::StatelessClass) { // *can't* be assigned to other roles, even in an emergency nonVersatileMachines++; + } + if (processClass == ProcessClass::UnsetClass || processClass == ProcessClass::StorageClass) { + possible_ss++; + } } // FIXME: temporarily code to test storage cache @@ -2097,6 +2106,10 @@ void setupSimulatedSystem(std::vector>* systemActors, assignedMachines++; } + + if (possible_ss - 
simconfig.db.desiredTSSCount / simconfig.db.usableRegions <= simconfig.db.storageTeamSize) { + gradualMigrationPossible = false; + } } g_simulator.desiredCoordinators = coordinatorCount; @@ -2144,6 +2157,7 @@ void setupSimulatedSystem(std::vector>* systemActors, // save some state that we only need when restarting the simulator. g_simulator.connectionString = conn.toString(); g_simulator.testerCount = testerCount; + g_simulator.allowStorageMigrationTypeChange = gradualMigrationPossible; TraceEvent("SimulatedClusterStarted") .detail("DataCenters", dataCenters) @@ -2152,6 +2166,7 @@ void setupSimulatedSystem(std::vector>* systemActors, .detail("SSLEnabled", sslEnabled) .detail("SSLOnly", sslOnly) .detail("ClassesAssigned", assignClasses) + .detail("GradualMigrationPossible", gradualMigrationPossible) .detail("StartingConfiguration", pStartingConfiguration->toString()); } diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index 46a93b0ad8..bec84f5bbe 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -25,12 +25,17 @@ #include "fdbserver/Knobs.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbrpc/simulator.h" +#include "fdbserver/QuietDatabase.h" #include "flow/actorcompiler.h" // This must be the last #include. // "ssd" is an alias to the preferred type which skews the random distribution toward it but that's okay. 
static const char* storeTypes[] = { "ssd", "ssd-1", "ssd-2", "memory", "memory-1", "memory-2", "memory-radixtree-beta" }; +static const char* storageMigrationTypes[] = { "perpetual_storage_wiggle=0 storage_migration_type=aggressive", + "perpetual_storage_wiggle=1", + "perpetual_storage_wiggle=1 storage_migration_type=gradual", + "storage_migration_type=aggressive" }; static const char* logTypes[] = { "log_engine:=1", "log_engine:=2", "log_spill:=1", "log_spill:=2", "log_version:=2", "log_version:=3", "log_version:=4", "log_version:=5", "log_version:=6" }; @@ -214,6 +219,7 @@ struct ConfigureDatabaseWorkload : TestWorkload { double testDuration; int additionalDBs; bool allowDescriptorChange; + bool allowTestStorageMigration; std::vector> clients; PerfIntCounter retries; @@ -221,7 +227,8 @@ struct ConfigureDatabaseWorkload : TestWorkload { testDuration = getOption(options, LiteralStringRef("testDuration"), 200.0); allowDescriptorChange = getOption(options, LiteralStringRef("allowDescriptorChange"), SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT); - + allowTestStorageMigration = + getOption(options, "allowTestStorageMigration"_sr, false) && g_simulator.allowStorageMigrationTypeChange; g_simulator.usableRegions = 1; } @@ -230,7 +237,7 @@ struct ConfigureDatabaseWorkload : TestWorkload { Future setup(Database const& cx) override { return _setup(cx, this); } Future start(Database const& cx) override { return _start(this, cx); } - Future check(Database const& cx) override { return true; } + Future check(Database const& cx) override { return _check(this, cx); } void getMetrics(std::vector& m) override { m.push_back(retries.getMetric()); } @@ -250,7 +257,7 @@ struct ConfigureDatabaseWorkload : TestWorkload { } ACTOR Future _setup(Database cx, ConfigureDatabaseWorkload* self) { - wait(success(ManagementAPI::changeConfig(cx.getReference(), "single", true))); + wait(success(ManagementAPI::changeConfig(cx.getReference(), "single storage_migration_type=aggressive", true))); return 
Void(); } @@ -262,6 +269,44 @@ struct ConfigureDatabaseWorkload : TestWorkload { return Void(); } + ACTOR Future _check(ConfigureDatabaseWorkload* self, Database cx) { + // only storage_migration_type=gradual && perpetual_storage_wiggle=1 need this check because in QuietDatabase + // perpetual wiggle will be forced to close. For other cases, the later ConsistencyCheck will check KV store type + // there + if (self->allowTestStorageMigration) { + state DatabaseConfiguration conf = wait(getDatabaseConfiguration(cx)); + state int i; + loop { + state bool pass = true; + state std::vector storageServers = wait(getStorageServers(cx)); + + for (i = 0; i < storageServers.size(); i++) { + // Check that each storage server has the correct key value store type + if (!storageServers[i].isTss()) { + ReplyPromise typeReply; + ErrorOr keyValueStoreType = + wait(storageServers[i].getKeyValueStoreType.getReplyUnlessFailedFor(typeReply, 2, 0)); + if (keyValueStoreType.present() && keyValueStoreType.get() != conf.storageServerStoreType) { + TraceEvent(SevWarn, "ConfigureDatabase_WrongStoreType") + .suppressFor(5.0) + .detail("ServerID", storageServers[i].id()) + .detail("ProcessID", storageServers[i].locality.processId()) + .detail("ServerStoreType", + keyValueStoreType.present() ? keyValueStoreType.get().toString() : "?") + .detail("ConfigStoreType", conf.storageServerStoreType.toString()); + pass = false; + break; + } + } + } + if (pass) + break; + wait(delay(g_network->isSimulated() ? 2.0 : 30.0)); + } + } + return true; + } + static int randomRoleNumber() { int i = deterministicRandom()->randomInt(0, 4); return i ? 
i : -1; @@ -273,8 +318,12 @@ struct ConfigureDatabaseWorkload : TestWorkload { if (g_simulator.speedUpSimulation) { return Void(); } - state int randomChoice = deterministicRandom()->randomInt(0, 8); - + state int randomChoice; + if (self->allowTestStorageMigration) { + randomChoice = deterministicRandom()->randomInt(4, 9); + } else { + randomChoice = deterministicRandom()->randomInt(0, 8); + } if (randomChoice == 0) { wait(success( runRYWTransaction(cx, [=](Reference tr) -> Future> { @@ -345,6 +394,15 @@ struct ConfigureDatabaseWorkload : TestWorkload { cx, backupTypes[deterministicRandom()->randomInt(0, sizeof(backupTypes) / sizeof(backupTypes[0]))], false))); + } else if (randomChoice == 8) { + if (self->allowTestStorageMigration) { + TEST(true); // storage migration type change + wait(success(IssueConfigurationChange( + cx, + storageMigrationTypes[deterministicRandom()->randomInt( + 0, sizeof(storageMigrationTypes) / sizeof(storageMigrationTypes[0]))], + false))); + } } else { ASSERT(false); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0bb917f32a..92464ddf54 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -259,6 +259,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES slow/CloggedStorefront.toml) add_fdb_test(TEST_FILES slow/CommitBug.toml) add_fdb_test(TEST_FILES slow/ConfigureTest.toml) + add_fdb_test(TEST_FILES slow/ConfigureStorageMigrationTest.toml) add_fdb_test(TEST_FILES slow/CycleRollbackPlain.toml) add_fdb_test(TEST_FILES slow/DDBalanceAndRemove.toml) add_fdb_test(TEST_FILES slow/DDBalanceAndRemoveStatus.toml) diff --git a/tests/slow/ConfigureStorageMigrationTest.toml b/tests/slow/ConfigureStorageMigrationTest.toml new file mode 100644 index 0000000000..9d9219f0f6 --- /dev/null +++ b/tests/slow/ConfigureStorageMigrationTest.toml @@ -0,0 +1,20 @@ +[configuration] +extraMachineCountDC = 2 + +[[test]] +testTitle = 'CloggedConfigureDatabaseTest' + + [[test.workload]] + testName = 'ConfigureDatabase' + testDuration = 300.0 + 
allowTestStorageMigration = true + + [[test.workload]] + testName = 'RandomClogging' + testDuration = 300.0 + + [[test.workload]] + testName = 'RandomClogging' + testDuration = 300.0 + scale = 0.1 + clogginess = 2.0