From 3c86643822f23164198828456218a7d1fdc8da8d Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 14 Feb 2019 16:24:46 -0800 Subject: [PATCH 01/46] Separate Ratekeeper from data distribution. Add a new role for ratekeeper. Remove StorageServerChanges from data distribution. Ratekeeper monitors storage servers, which borrows the idea from DataDistribution. --- fdbrpc/Locality.cpp | 23 +++++ fdbrpc/Locality.h | 7 +- fdbrpc/simulator.h | 1 + fdbserver/CMakeLists.txt | 1 + fdbserver/ClusterController.actor.cpp | 92 ++++++++++++++++++-- fdbserver/ClusterRecruitmentInterface.h | 7 +- fdbserver/DataDistribution.actor.cpp | 77 +++++------------ fdbserver/DataDistribution.actor.h | 1 + fdbserver/DataDistributorInterface.h | 36 +------- fdbserver/Knobs.cpp | 2 + fdbserver/Knobs.h | 2 + fdbserver/MasterProxyServer.actor.cpp | 17 ++-- fdbserver/Ratekeeper.actor.cpp | 110 ++++++++++++++++++------ fdbserver/Ratekeeper.h | 35 -------- fdbserver/RatekeeperInterface.h | 93 ++++++++++++++++++++ fdbserver/ServerDBInfo.h | 2 + fdbserver/WorkerInterface.actor.h | 16 +++- fdbserver/fdbserver.vcxproj | 2 +- fdbserver/fdbserver.vcxproj.filters | 3 +- fdbserver/masterserver.actor.cpp | 1 - fdbserver/worker.actor.cpp | 26 +++++- flow/network.h | 1 + 22 files changed, 377 insertions(+), 178 deletions(-) delete mode 100644 fdbserver/Ratekeeper.h create mode 100644 fdbserver/RatekeeperInterface.h diff --git a/fdbrpc/Locality.cpp b/fdbrpc/Locality.cpp index d1f1957d2a..ff9135d77e 100644 --- a/fdbrpc/Locality.cpp +++ b/fdbrpc/Locality.cpp @@ -185,6 +185,29 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons default: return ProcessClass::WorstFit; } + case ProcessClass::RateKeeper: + switch( _class ) { + case ProcessClass::RateKeeperClass: + return ProcessClass::BestFit; + case ProcessClass::StatelessClass: + return ProcessClass::GoodFit; + case ProcessClass::MasterClass: + return ProcessClass::OkayFit; + case ProcessClass::ResolutionClass: + return ProcessClass::OkayFit; + case ProcessClass::TransactionClass: + return ProcessClass::OkayFit; + case ProcessClass::ProxyClass: + return ProcessClass::OkayFit; + case ProcessClass::UnsetClass: + return ProcessClass::UnsetFit; + case ProcessClass::CoordinatorClass: + return ProcessClass::NeverAssign; + case ProcessClass::TesterClass: + return ProcessClass::NeverAssign; + default: + return ProcessClass::WorstFit; + } default: return ProcessClass::NeverAssign; } diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index bae2fd69e8..1415ad9eff 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -26,9 +26,9 @@ struct ProcessClass { // This enum is stored in restartInfo.ini for upgrade tests, so be very careful about changing the existing items! 
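// Recruitment treats lower Fitness values as better (BestFit == 0), so the
// RateKeeper column added in Locality.cpp above reads top-down as a preference
// ladder: a dedicated ratekeeper-class process wins, stateless processes come
// next, the OkayFit classes follow, and coordinators/testers are never
// assigned. A minimal sketch of how a recruiter consumes this ladder
// (hypothetical loop, not the actual getWorkerForRoleInDatacenter logic):
//
//     ProcessClass::Fitness best = ProcessClass::NeverAssign;
//     for (const auto& w : workers)
//         best = std::min(best, w.processClass.machineClassFitness(ProcessClass::RateKeeper));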
- enum ClassType { UnsetClass, StorageClass, TransactionClass, ResolutionClass, TesterClass, ProxyClass, MasterClass, StatelessClass, LogClass, ClusterControllerClass, LogRouterClass, DataDistributorClass, CoordinatorClass, InvalidClass = -1 }; + enum ClassType { UnsetClass, StorageClass, TransactionClass, ResolutionClass, TesterClass, ProxyClass, MasterClass, StatelessClass, LogClass, ClusterControllerClass, LogRouterClass, DataDistributorClass, CoordinatorClass, RateKeeperClass, InvalidClass = -1 }; enum Fitness { BestFit, GoodFit, UnsetFit, OkayFit, WorstFit, ExcludeFit, NeverAssign }; //cannot be larger than 7 because of leader election mask - enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, NoRole }; + enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, RateKeeper, NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; int16_t _class; int16_t _source; @@ -50,6 +50,7 @@ public: else if (s=="cluster_controller") _class = ClusterControllerClass; else if (s=="data_distributor") _class = DataDistributorClass; else if (s=="coordinator") _class = CoordinatorClass; + else if (s=="ratekeeper") _class = RateKeeperClass; else _class = InvalidClass; } @@ -67,6 +68,7 @@ public: else if (classStr=="cluster_controller") _class = ClusterControllerClass; else if (classStr=="data_distributor") _class = DataDistributorClass; else if (classStr=="coordinator") _class = CoordinatorClass; + else if (classStr=="ratekeeper") _class = RateKeeperClass; else _class = InvalidClass; if (sourceStr=="command_line") _source = CommandLineSource; @@ -99,6 +101,7 @@ public: case ClusterControllerClass: return "cluster_controller"; case DataDistributorClass: return "data_distributor"; case CoordinatorClass: return "coordinator"; + case RateKeeperClass: return "ratekeeper"; default: return "invalid"; } } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 2bfd34e98f..2987c80655 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -99,6 +99,7 @@ public: case ProcessClass::LogRouterClass: return false; case ProcessClass::ClusterControllerClass: return false; case ProcessClass::DataDistributorClass: return false; + case ProcessClass::RateKeeperClass: return false; default: return false; } } diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 9e630d4c7c..58853a2fee 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -57,6 +57,7 @@ set(FDBSERVER_SRCS QuietDatabase.h Ratekeeper.actor.cpp Ratekeeper.h + RatekeeperInterface.h RecoveryState.h Restore.actor.cpp RestoreInterface.h diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index aadebd93c2..d95b9c1385 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -30,6 +30,7 @@ #include "fdbserver/LogSystemConfig.h" #include "fdbserver/WaitFailure.h" #include "fdbserver/ClusterRecruitmentInterface.h" +#include "fdbserver/RatekeeperInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/Status.h" #include "fdbserver/LatencyBandConfig.h" @@ -110,17 +111,28 @@ public: { } - void setDistributor(const DataDistributorInterface& distributorInterf) { + void setDistributor(const DataDistributorInterface& interf) { ServerDBInfo newInfo = serverInfo->get(); newInfo.id = g_random->randomUniqueID(); - newInfo.distributor = distributorInterf; + newInfo.distributor = interf; serverInfo->set( newInfo ); } - 
void clearDistributor() {
+	void setRatekeeper(const RatekeeperInterface& interf) {
 		ServerDBInfo newInfo = serverInfo->get();
 		newInfo.id = g_random->randomUniqueID();
-		newInfo.distributor = Optional<DataDistributorInterface>();
+		newInfo.ratekeeper = interf;
+		serverInfo->set( newInfo );
+	}
+
+	void clearInterf(ProcessClass::ClassType t) {
+		ServerDBInfo newInfo = serverInfo->get();
+		newInfo.id = g_random->randomUniqueID();
+		if (t == ProcessClass::DataDistributorClass) {
+			newInfo.distributor = Optional<DataDistributorInterface>();
+		} else if (t == ProcessClass::RateKeeperClass) {
+			newInfo.ratekeeper = Optional<RatekeeperInterface>();
+		}
 		serverInfo->set( newInfo );
 	}
 };
@@ -524,6 +536,9 @@ public:
 		if (db.serverInfo->get().distributor.present()) {
 			(*id_used)[db.serverInfo->get().distributor.get().locality.processId()]++;
 		}
+		if (db.serverInfo->get().ratekeeper.present()) {
+			(*id_used)[db.serverInfo->get().ratekeeper.get().locality.processId()]++;
+		}
 	}

 	RecruitRemoteFromConfigurationReply findRemoteWorkersForConfiguration( RecruitRemoteFromConfigurationRequest const& req ) {
@@ -1752,7 +1767,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
 	if ( req.distributorInterf.present() && !self->db.serverInfo->get().distributor.present() ) {
 		const DataDistributorInterface& di = req.distributorInterf.get();
 		TraceEvent("ClusterController_RegisterDataDistributor", self->id).detail("DDID", di.id());
-		self->db.setDistributor( di );
+		self->db.setDistributor(di);
 	}
 	if( info == self->id_worker.end() ) {
 		self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo );
@@ -2341,7 +2356,7 @@ ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerDa
 	}
 }

-ACTOR Future<Void> waitDDRejoinOrStartDD( ClusterControllerData *self, ClusterControllerFullInterface *clusterInterface ) {
+ACTOR Future<Void> waitDDRejoinOrStartDD(ClusterControllerData *self) {
 	state Future<Void> initialDelay = delay(SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY);

 	// wait for a while to see if existing data distributor will join.
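setRatekeeper() and clearInterf() above publish a fresh ServerDBInfo through the
controller's AsyncVar, regenerating newInfo.id so each snapshot is
distinguishable. Consumers react via onChange(); a minimal sketch of that
pattern, assuming only AsyncVar's get()/set()/onChange() semantics from flow
(watchRatekeeper is a hypothetical example, not code from this patch):

    ACTOR Future<Void> watchRatekeeper(Reference<AsyncVar<ServerDBInfo>> db) {
    	loop {
    		if (db->get().ratekeeper.present()) {
    			// a ratekeeper is known: e.g. start directing GetRateInfoRequests
    			// at db->get().ratekeeper.get()
    		}
    		wait( db->onChange() );  // fires once per serverInfo->set(newInfo)
    	}
    }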
@@ -2361,10 +2376,68 @@ ACTOR Future<Void> waitDDRejoinOrStartDD( ClusterControllerData *self, ClusterCo
 			wait( waitFailureClient( self->db.serverInfo->get().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) );
 			TraceEvent("ClusterController", self->id)
 				.detail("DataDistributorDied", self->db.serverInfo->get().distributor.get().id());
-			self->db.clearDistributor();
+			self->db.clearInterf(ProcessClass::DataDistributorClass);
 		} else {
 			DataDistributorInterface distributorInterf = wait( startDataDistributor(self) );
-			self->db.setDistributor( distributorInterf );
+			self->db.setDistributor(distributorInterf);
+		}
+	}
+}
+
+ACTOR Future<RatekeeperInterface> startRatekeeper(ClusterControllerData *self) {
+	loop {
+		try {
+			while ( self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ) {
+				wait( self->db.serverInfo->onChange() );
+			}
+
+			std::map<Optional<Standalone<StringRef>>, int> id_used = self->getUsedIds();
+			Optional<Key> dcId = self->clusterControllerDcId;
+			state WorkerFitnessInfo rkWorker = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::RateKeeper, ProcessClass::NeverAssign, self->db.config, id_used);
+			state InitializeRatekeeperRequest req;
+			req.reqId = g_random->randomUniqueID();
+			TraceEvent("ClusterController_RecruitRatekeeper", req.reqId).detail("Addr", rkWorker.worker.first.address());
+
+			ErrorOr<RatekeeperInterface> interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) );
+			if (interf.present()) {
+				TraceEvent("ClusterController_RatekeeperRecruited", req.reqId).detail("Addr", rkWorker.worker.first.address());
+				return interf.get();
+			}
+		}
+		catch (Error& e) {
+			TraceEvent("ClusterController_RatekeeperRecruitError", req.reqId).error(e);
+			if ( e.code() != error_code_no_more_servers ) {
+				throw;
+			}
+		}
+		wait( delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY) );
+	}
+}
+
+ACTOR Future<Void> waitRKRejoinOrStartRK(ClusterControllerData *self) {
+	state Future<Void> initialDelay = delay(SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY);
+
+	// wait for a while to see if an existing ratekeeper will join.
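// The choose/when block that follows races two events: the
// WAIT_FOR_RATEKEEPER_JOIN_DELAY timer, and a ServerDBInfo change published
// when an existing ratekeeper re-registers through worker registration. Only
// if the timer fires first does the controller fall through to
// startRatekeeper(), so a controller failover does not needlessly recruit a
// second ratekeeper.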
+ loop choose { + when ( wait(initialDelay) ) { break; } + when ( wait(self->db.serverInfo->onChange()) ) { // Rejoins via worker registration + if ( self->db.serverInfo->get().ratekeeper.present() ) { + TraceEvent("ClusterController_GotRateKeeper", self->id) + .detail("RKID", self->db.serverInfo->get().ratekeeper.get().id()); + break; + } + } + } + + loop { + if ( self->db.serverInfo->get().ratekeeper.present() ) { + wait( waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ) ); + TraceEvent("ClusterController", self->id) + .detail("RatekeeperDied", self->db.serverInfo->get().ratekeeper.get().id()); + self->db.clearInterf(ProcessClass::RateKeeperClass); + } else { + RatekeeperInterface rkInterf = wait( startRatekeeper(self) ); + self->db.setRatekeeper(rkInterf); } } } @@ -2385,8 +2458,9 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, self.addActor.send( updatedChangingDatacenters(&self) ); self.addActor.send( updatedChangedDatacenters(&self) ); self.addActor.send( updateDatacenterVersionDifference(&self) ); - self.addActor.send( waitDDRejoinOrStartDD(&self, &interf) ); self.addActor.send( handleForcedRecoveries(&self, interf) ); + self.addActor.send( waitDDRejoinOrStartDD(&self) ); + self.addActor.send( waitRKRejoinOrStartRK(&self) ); //printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str()); loop choose { diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index aefbe167c8..db49a9d075 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -168,15 +168,16 @@ struct RegisterWorkerRequest { ClusterControllerPriorityInfo priorityInfo; Generation generation; Optional distributorInterf; + Optional ratekeeperInterf; ReplyPromise reply; RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {} - RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf) : - wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf) {} + RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf, Optional rkInterf) : + wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf) {} template void serialize( Ar& ar ) { - serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, reply); + serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, reply); } }; diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 48f5732fca..fe52e5e42a 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -29,7 +29,6 @@ #include "fdbserver/WaitFailure.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/IKeyValueStore.h" -#include "fdbserver/Ratekeeper.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbrpc/Replication.h" #include "flow/UnitTest.h" @@ -570,7 +569,6 @@ struct DDTeamCollection : ReferenceCounted { PromiseStream removedServers; 
std::set recruitingIds; // The IDs of the SS which are being recruited std::set recruitingLocalities; - Optional> >> serverChanges; Future initialFailureReactionDelay; Future initializationDoneActor; Promise serverTrackerErrorOut; @@ -629,13 +627,12 @@ struct DDTeamCollection : ReferenceCounted { Reference const& shardsAffectedByTeamFailure, DatabaseConfiguration configuration, std::vector> includedDCs, Optional>> otherTrackedDCs, - Optional>>> const& serverChanges, Future readyToStart, Reference> zeroHealthyTeams, bool primary, Reference> processingUnhealthy) : cx(cx), distributorId(distributorId), lock(lock), output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()), badTeamRemover(Void()), redundantTeamRemover(Void()), configuration(configuration), - serverChanges(serverChanges), readyToStart(readyToStart), + readyToStart(readyToStart), checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution)), initialFailureReactionDelay( delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution)), @@ -2839,10 +2836,6 @@ ACTOR Future storageServerTracker( state Future storeTracker = keyValueStoreTypeTracker( self, server ); state bool hasWrongStoreTypeOrDC = false; - if(self->serverChanges.present()) { - self->serverChanges.get().send( std::make_pair(server->id, server->lastKnownInterface) ); - } - try { loop { status.isUndesired = false; @@ -2933,9 +2926,6 @@ ACTOR Future storageServerTracker( when( wait( failureTracker ) ) { // The server is failed AND all data has been removed from it, so permanently remove it. TraceEvent("StatusMapChange", self->distributorId).detail("ServerID", server->id).detail("Status", "Removing"); - if(self->serverChanges.present()) { - self->serverChanges.get().send( std::make_pair(server->id, Optional()) ); - } if(server->updated.canBeSet()) { server->updated.send(Void()); @@ -3040,9 +3030,6 @@ ACTOR Future storageServerTracker( } interfaceChanged = server->onInterfaceChanged; - if(self->serverChanges.present()) { - self->serverChanges.get().send( std::make_pair(server->id, server->lastKnownInterface) ); - } // We rely on the old failureTracker being actorCancelled since the old actor now has a pointer to an invalid location status = ServerStatus( status.isFailed, status.isUndesired, server->lastKnownInterface.locality ); @@ -3460,13 +3447,11 @@ ACTOR Future pollMoveKeysLock( Database cx, MoveKeysLock lock ) { } } -ACTOR Future dataDistribution( - Reference> db, - UID myId, - PromiseStream< std::pair> > serverChanges, - double* lastLimited) +ACTOR Future dataDistribution(Reference self, + double* lastLimited) { - state Database cx = openDBOnServer(db, TaskDataDistributionLaunch, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); + state DatabaseConfiguration configuration = self->configuration->get(); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; //cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) ); @@ -3532,20 +3517,20 @@ ACTOR Future dataDistribution( Reference initData_ = wait( getInitialDataDistribution(cx, myId, lock, configuration.usableRegions > 1 ? 
remoteDcIds : std::vector>() ) ); initData = initData_; if(initData->shards.size() > 1) { - TraceEvent("DDInitGotInitialDD", myId) + TraceEvent("DDInitGotInitialDD", self->ddId) .detail("B", printable(initData->shards.end()[-2].key)) .detail("E", printable(initData->shards.end()[-1].key)) .detail("Src", describe(initData->shards.end()[-2].primarySrc)) .detail("Dest", describe(initData->shards.end()[-2].primaryDest)) .trackLatest("InitialDD"); } else { - TraceEvent("DDInitGotInitialDD", myId).detail("B","").detail("E", "").detail("Src", "[no items]").detail("Dest", "[no items]").trackLatest("InitialDD"); + TraceEvent("DDInitGotInitialDD", self->ddId).detail("B","").detail("E", "").detail("Src", "[no items]").detail("Dest", "[no items]").trackLatest("InitialDD"); } if (initData->mode) break; // mode may be set true by system operator using fdbcli - TraceEvent("DataDistributionDisabled", myId); + TraceEvent("DataDistributionDisabled", self->ddId); - TraceEvent("MovingData", myId) + TraceEvent("MovingData", self->ddId) .detail( "InFlight", 0 ) .detail( "InQueue", 0 ) .detail( "AverageShardSize", -1 ) @@ -3554,8 +3539,8 @@ ACTOR Future dataDistribution( .detail( "HighestPriority", 0 ) .trackLatest( "MovingData" ); - TraceEvent("TotalDataInFlight", myId).detail("Primary", true).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", 0).trackLatest("TotalDataInFlight"); - TraceEvent("TotalDataInFlight", myId).detail("Primary", false).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", configuration.usableRegions > 1 ? 0 : -1).trackLatest("TotalDataInFlightRemote"); + TraceEvent("TotalDataInFlight", self->ddId).detail("Primary", true).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", 0).trackLatest("TotalDataInFlight"); + TraceEvent("TotalDataInFlight", self->ddId).detail("Primary", false).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", configuration.usableRegions > 1 ? 0 : -1).trackLatest("TotalDataInFlightRemote"); wait( waitForDataDistributionEnabled(cx) ); TraceEvent("DataDistributionEnabled"); @@ -3573,12 +3558,12 @@ ACTOR Future dataDistribution( state Reference shardsAffectedByTeamFailure( new ShardsAffectedByTeamFailure ); state int shard = 0; - for(; shardshards.size() - 1; shard++) { + for (; shard < initData->shards.size() - 1; shard++) { KeyRangeRef keys = KeyRangeRef(initData->shards[shard].key, initData->shards[shard+1].key); shardsAffectedByTeamFailure->defineShard(keys); std::vector teams; teams.push_back(ShardsAffectedByTeamFailure::Team(initData->shards[shard].primarySrc, true)); - if(configuration.usableRegions > 1) { + if (configuration.usableRegions > 1) { teams.push_back(ShardsAffectedByTeamFailure::Team(initData->shards[shard].remoteSrc, false)); } if(g_network->isSimulated()) { @@ -3587,11 +3572,11 @@ ACTOR Future dataDistribution( } shardsAffectedByTeamFailure->moveShard(keys, teams); - if(initData->shards[shard].hasDest) { + if (initData->shards[shard].hasDest) { // This shard is already in flight. Ideally we should use dest in sABTF and generate a dataDistributionRelocator directly in // DataDistributionQueue to track it, but it's easier to just (with low priority) schedule it for movement. 
bool unhealthy = initData->shards[shard].primarySrc.size() != configuration.storageTeamSize; - if(!unhealthy && configuration.usableRegions > 1) { + if (!unhealthy && configuration.usableRegions > 1) { unhealthy = initData->shards[shard].remoteSrc.size() != configuration.storageTeamSize; } output.send( RelocateShard( keys, unhealthy ? PRIORITY_TEAM_UNHEALTHY : PRIORITY_RECOVER_MOVE ) ); @@ -3620,20 +3605,20 @@ ACTOR Future dataDistribution( } actors.push_back( pollMoveKeysLock(cx, lock) ); - actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, myId ), "DDTracker", myId, &normalDDQueueErrors() ) ); - actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, myId, storageTeamSize, lastLimited ), "DDQueue", myId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector teamCollectionsPtrs; - Reference primaryTeamCollection( new DDTeamCollection(cx, myId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? remoteDcIds : std::vector>(), serverChanges, readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); + Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->primaryDcId, configuration.usableRegions > 1 ? 
self->remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); teamCollectionsPtrs.push_back(primaryTeamCollection.getPtr()); if (configuration.usableRegions > 1) { - Reference remoteTeamCollection( new DDTeamCollection(cx, myId, lock, output, shardsAffectedByTeamFailure, configuration, remoteDcIds, Optional>>(), serverChanges, readyToStart.getFuture() && remoteRecovered(db), zeroHealthyTeams[1], false, processingUnhealthy) ); + Reference remoteTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->remoteDcIds, Optional>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo), zeroHealthyTeams[1], false, processingUnhealthy) ); teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr()); remoteTeamCollection->teamCollections = teamCollectionsPtrs; - actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], db ), "DDTeamCollectionSecondary", myId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], self->dbInfo ), "DDTeamCollectionSecondary", self->ddId, &normalDDQueueErrors() ) ); } primaryTeamCollection->teamCollections = teamCollectionsPtrs; - actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( primaryTeamCollection, initData, tcis[0], db ), "DDTeamCollectionPrimary", myId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( primaryTeamCollection, initData, tcis[0], self->dbInfo ), "DDTeamCollectionPrimary", self->ddId, &normalDDQueueErrors() ) ); actors.push_back(yieldPromiseStream(output.getFuture(), input)); wait( waitForAll( actors ) ); @@ -3654,7 +3639,6 @@ ACTOR Future dataDistribution( struct DataDistributorData : NonCopyable, ReferenceCounted { Reference> dbInfo; UID ddId; - PromiseStream< std::pair> > ddStorageServerChanges; PromiseStream> addActor; DataDistributorData(Reference> const& db, UID id) : dbInfo(db), ddId(id) {} @@ -3672,19 +3656,7 @@ static std::set const& normalDataDistributorErrors() { return s; } -static std::set const& normalRateKeeperErrors() { - static std::set s; - if (s.empty()) { - s.insert( error_code_worker_removed ); - s.insert( error_code_broken_promise ); - s.insert( error_code_actor_cancelled ); - s.insert( error_code_please_reboot ); - } - return s; -} - ACTOR Future dataDistributor(DataDistributorInterface di, Reference> db ) { - state UID lastClusterControllerID(0,0); state Reference self( new DataDistributorData(db, di.id()) ); state Future collection = actorCollection( self->addActor.getFuture() ); @@ -3693,10 +3665,8 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference> > ddStorageServerChanges; state double lastLimited = 0; - state Future distributor = reportErrorsExcept( dataDistribution( self->dbInfo, di.id(), ddStorageServerChanges, &lastLimited ), "DataDistribution", di.id(), &normalDataDistributorErrors() ); - self->addActor.send( reportErrorsExcept( rateKeeper( self->dbInfo, ddStorageServerChanges, di.getRateInfo.getFuture(), &lastLimited ), "Ratekeeper", di.id(), &normalRateKeeperErrors() ) ); + state Future distributor = reportErrorsExcept( dataDistribution( self->dbInfo, &lastLimited ), "DataDistribution", di.id(), &normalDataDistributorErrors() ); wait( distributor || collection ); } @@ -3732,7 +3702,6 @@ DDTeamCollection* testTeamCollection(int teamSize, IRepPolicyRef policy, int pro conf, {}, 
{}, - PromiseStream>>(), Future(Void()), Reference>( new AsyncVar(true) ), true, @@ -3765,7 +3734,7 @@ DDTeamCollection* testMachineTeamCollection(int teamSize, IRepPolicyRef policy, DDTeamCollection* collection = new DDTeamCollection(database, UID(0, 0), MoveKeysLock(), PromiseStream(), Reference(new ShardsAffectedByTeamFailure()), conf, {}, {}, - PromiseStream>>(), Future(Void()), + Future(Void()), Reference>(new AsyncVar(true)), true, Reference>(new AsyncVar(false))); diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index b838217192..1baff7f58e 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -253,5 +253,6 @@ int64_t getMaxShardSize( double dbSizeEstimate ); class DDTeamCollection; ACTOR Future teamRemover(DDTeamCollection* self); ACTOR Future teamRemoverPeriodic(DDTeamCollection* self); +ACTOR Future>> getServerListAndProcessClasses(Transaction* tr); #endif diff --git a/fdbserver/DataDistributorInterface.h b/fdbserver/DataDistributorInterface.h index 2150eb08e5..d437fc69ae 100644 --- a/fdbserver/DataDistributorInterface.h +++ b/fdbserver/DataDistributorInterface.h @@ -27,15 +27,14 @@ struct DataDistributorInterface { RequestStream> waitFailure; - RequestStream getRateInfo; struct LocalityData locality; DataDistributorInterface() {} explicit DataDistributorInterface(const struct LocalityData& l) : locality(l) {} void initEndpoints() {} - UID id() const { return getRateInfo.getEndpoint().token; } - NetworkAddress address() const { return getRateInfo.getEndpoint().getPrimaryAddress(); } + UID id() const { return waitFailure.getEndpoint().token; } + NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); } bool operator== (const DataDistributorInterface& r) const { return id() == r.id(); } @@ -45,36 +44,7 @@ struct DataDistributorInterface { template void serialize(Archive& ar) { - serializer(ar, waitFailure, getRateInfo, locality); - } -}; - -struct GetRateInfoRequest { - UID requesterID; - int64_t totalReleasedTransactions; - int64_t batchReleasedTransactions; - bool detailed; - ReplyPromise reply; - - GetRateInfoRequest() {} - GetRateInfoRequest(UID const& requesterID, int64_t totalReleasedTransactions, int64_t batchReleasedTransactions, bool detailed) - : requesterID(requesterID), totalReleasedTransactions(totalReleasedTransactions), batchReleasedTransactions(batchReleasedTransactions), detailed(detailed) {} - - template - void serialize(Ar& ar) { - serializer(ar, requesterID, totalReleasedTransactions, batchReleasedTransactions, detailed, reply); - } -}; - -struct GetRateInfoReply { - double transactionRate; - double batchTransactionRate; - double leaseDuration; - HealthMetrics healthMetrics; - - template - void serialize(Ar& ar) { - serializer(ar, transactionRate, batchTransactionRate, leaseDuration, healthMetrics); + serializer(ar, waitFailure, locality); } }; diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 8b37bb55fb..815d933211 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -307,11 +307,13 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( WAIT_FOR_GOOD_REMOTE_RECRUITMENT_DELAY, 5.0 ); init( ATTEMPT_RECRUITMENT_DELAY, 0.035 ); init( WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 1.0 ); + init( WAIT_FOR_RATEKEEPER_JOIN_DELAY, 1.0 ); init( WORKER_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) WORKER_FAILURE_TIME = 10.0; init( CHECK_OUTSTANDING_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) CHECK_OUTSTANDING_INTERVAL = 0.001; init( 
VERSION_LAG_METRIC_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) VERSION_LAG_METRIC_INTERVAL = 10.0; init( MAX_VERSION_DIFFERENCE, 20 * VERSIONS_PER_SECOND ); init( FORCE_RECOVERY_CHECK_DELAY, 5.0 ); + init( RATEKEEPER_FAILURE_TIME, 1.0 ); init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL, 600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0; init( EXPECTED_MASTER_FITNESS, ProcessClass::UnsetFit ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index f3698b3561..b2b77861db 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -248,12 +248,14 @@ public: double WAIT_FOR_GOOD_REMOTE_RECRUITMENT_DELAY; double ATTEMPT_RECRUITMENT_DELAY; double WAIT_FOR_DISTRIBUTOR_JOIN_DELAY; + double WAIT_FOR_RATEKEEPER_JOIN_DELAY; double WORKER_FAILURE_TIME; double CHECK_OUTSTANDING_INTERVAL; double INCOMPATIBLE_PEERS_LOGGING_INTERVAL; double VERSION_LAG_METRIC_INTERVAL; int64_t MAX_VERSION_DIFFERENCE; double FORCE_RECOVERY_CHECK_DELAY; + double RATEKEEPER_FAILURE_TIME; // Knobs used to select the best policy (via monte carlo) int POLICY_RATING_TESTS; // number of tests per policy (in order to compare) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 8755a69069..2b3299572e 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -97,18 +97,15 @@ ACTOR Future getRate(UID myID, Reference> db, int64 state int64_t lastTC = 0; - if (db->get().distributor.present()) { - nextRequestTimer = Void(); - } - + if (db->get().ratekeeper.present()) nextRequestTimer = Void(); loop choose { when ( wait( db->onChange() ) ) { - if ( db->get().distributor.present() ) { - TraceEvent("Proxy_DataDistributorChanged", myID) - .detail("DDID", db->get().distributor.get().id()); - nextRequestTimer = Void(); // trigger GetRate request + if ( db->get().ratekeeper.present() ) { + TraceEvent("Proxy_RatekeeperChanged", myID) + .detail("RKID", db->get().ratekeeper.get().id()); + nextRequestTimer = Void(); // trigger GetRate request } else { - TraceEvent("Proxy_DataDistributorDied", myID); + TraceEvent("Proxy_RatekeeperDied", myID); nextRequestTimer = Never(); reply = Never(); } @@ -116,7 +113,7 @@ ACTOR Future getRate(UID myID, Reference> db, int64 when ( wait( nextRequestTimer ) ) { nextRequestTimer = Never(); bool detailed = now() - lastDetailedReply > SERVER_KNOBS->DETAILED_METRIC_UPDATE_RATE; - reply = brokenPromiseToNever(db->get().distributor.get().getRateInfo.getReply(GetRateInfoRequest(myID, *inTransactionCount, *inBatchTransactionCount, detailed))); + reply = brokenPromiseToNever(db->get().ratekeeper.get().getRateInfo.getReply(GetRateInfoRequest(myID, *inTransactionCount, *inBatchTransactionCount, detailed))); expectingDetailedReply = detailed; } when ( GetRateInfoReply rep = wait(reply) ) { diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 1baf876d51..83a8778411 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -19,13 +19,14 @@ */ #include "flow/IndexedSet.h" -#include "fdbserver/Ratekeeper.h" #include "fdbrpc/FailureMonitor.h" -#include "fdbserver/Knobs.h" #include "fdbrpc/Smoother.h" -#include "fdbserver/ServerDBInfo.h" #include "fdbrpc/simulator.h" #include "fdbclient/ReadYourWrites.h" +#include "fdbserver/Knobs.h" +#include "fdbserver/DataDistribution.h" +#include "fdbserver/ServerDBInfo.h" +#include "fdbserver/WaitFailure.h" #include "flow/actorcompiler.h" // This must be the last #include. 
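With the MasterProxyServer change above, proxies obtain their transaction
budget from the ratekeeper role instead of data distribution. A
GetRateInfoReply carries a transactionRate and a leaseDuration; a minimal
sketch of how a recipient can meter transaction starts against such a grant
(RateBudget is a hypothetical illustration, not the proxy's actual
transactionStarter logic):

    #include <algorithm>

    struct RateBudget {
    	double rate = 0;         // transactions/sec granted by the ratekeeper
    	double leaseExpiry = 0;  // time at which the grant stops being valid
    	double budget = 0;       // releasable transactions accumulated so far

    	void onReply(const GetRateInfoReply& rep, double now) {
    		rate = rep.transactionRate;
    		leaseExpiry = now + rep.leaseDuration;  // stall if ratekeeper goes quiet
    	}
    	bool tryStart(double elapsed, double now) {
    		if (now > leaseExpiry) return false;               // lease lapsed: fail safe
    		budget = std::min(budget + rate * elapsed, rate);  // cap burst at ~1s of rate
    		if (budget < 1) return false;
    		budget -= 1;                                       // consume one transaction start
    		return true;
    	}
    };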
enum limitReason_t { @@ -146,7 +147,7 @@ struct TransactionCounts { TransactionCounts() : total(0), batch(0), time(0) {} }; -struct Ratekeeper { +struct RatekeeperData { Map storageQueueInfo; Map tlogQueueInfo; @@ -154,6 +155,7 @@ struct Ratekeeper { Smoother smoothReleasedTransactions, smoothBatchReleasedTransactions, smoothTotalDurableBytes; HealthMetrics healthMetrics; DatabaseConfiguration configuration; + PromiseStream> addActor; Int64MetricHandle actualTpsMetric; @@ -163,7 +165,7 @@ struct Ratekeeper { RatekeeperLimits normalLimits; RatekeeperLimits batchLimits; - Ratekeeper() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT), + RatekeeperData() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT), actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")), lastWarning(0), normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE), @@ -172,7 +174,7 @@ struct Ratekeeper { }; //SOMEDAY: template trackStorageServerQueueInfo and trackTLogQueueInfo into one function -ACTOR Future trackStorageServerQueueInfo( Ratekeeper* self, StorageServerInterface ssi ) { +ACTOR Future trackStorageServerQueueInfo( RatekeeperData* self, StorageServerInterface ssi ) { self->storageQueueInfo.insert( mapPair(ssi.id(), StorageQueueInfo(ssi.id(), ssi.locality) ) ); state Map::iterator myQueueInfo = self->storageQueueInfo.find(ssi.id()); TraceEvent("RkTracking", ssi.id()); @@ -217,7 +219,7 @@ ACTOR Future trackStorageServerQueueInfo( Ratekeeper* self, StorageServerI } } -ACTOR Future trackTLogQueueInfo( Ratekeeper* self, TLogInterface tli ) { +ACTOR Future trackTLogQueueInfo( RatekeeperData* self, TLogInterface tli ) { self->tlogQueueInfo.insert( mapPair(tli.id(), TLogQueueInfo(tli.id()) ) ); state Map::iterator myQueueInfo = self->tlogQueueInfo.find(tli.id()); TraceEvent("RkTracking", tli.id()); @@ -270,7 +272,7 @@ ACTOR Future splitError( Future in, Promise errOut ) { } ACTOR Future trackEachStorageServer( - Ratekeeper* self, + RatekeeperData* self, FutureStream< std::pair> > serverChanges ) { state Map> actors; @@ -289,7 +291,59 @@ ACTOR Future trackEachStorageServer( } } -void updateRate( Ratekeeper* self, RatekeeperLimits &limits ) { +ACTOR Future monitorServerListChange( + Reference> dbInfo, + PromiseStream< std::pair> > serverChanges) { + state Database db = openDBOnServer(dbInfo, TaskRateKeeper, true, true); + state Future checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY); + state Future>> serverListAndProcessClasses = Never(); + state std::map oldServers; + state Transaction tr(db); + + loop { + try { + choose { + when ( wait( checkSignal ) ) { + checkSignal = Never(); + serverListAndProcessClasses = getServerListAndProcessClasses(&tr); + } + when ( vector> results = wait( serverListAndProcessClasses ) ) { + serverListAndProcessClasses = Never(); + + std::map newServers; + for( int i = 0; i < results.size(); i++ ) { + UID serverId = results[i].first.id(); + StorageServerInterface const& ssi = results[i].first; + newServers[serverId] = ssi; + + if ( oldServers.count( serverId ) ) { + if (ssi.getValue.getEndpoint() != oldServers[serverId].getValue.getEndpoint()) { + 
serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); + } + oldServers.erase(serverId); + } else { + serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); + } + } + + for (auto it : oldServers) { + serverChanges.send( std::make_pair(it.first, Optional()) ); + } + + oldServers.swap(newServers); + tr = Transaction(db); + checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY); + } + } + } catch(Error& e) { + wait( tr.onError(e) ); + serverListAndProcessClasses = Never(); + checkSignal = Void(); + } + } +} + +void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { //double controlFactor = ; // dt / eFoldingTime double actualTps = self->smoothReleasedTransactions.smoothRate(); @@ -566,7 +620,7 @@ void updateRate( Ratekeeper* self, RatekeeperLimits &limits ) { } } -ACTOR Future configurationMonitor( Ratekeeper* self, Reference> dbInfo ) { +ACTOR Future configurationMonitor(Reference> dbInfo, DatabaseConfiguration* conf) { state Database cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); loop { state ReadYourWritesTransaction tr(cx); @@ -578,7 +632,7 @@ ACTOR Future configurationMonitor( Ratekeeper* self, Reference results = wait( tr.getRange( configKeys, CLIENT_KNOBS->TOO_MANY ) ); ASSERT( !results.more && results.size() < CLIENT_KNOBS->TOO_MANY ); - self->configuration.fromKeyValues( (VectorRef) results ); + conf->fromKeyValues( (VectorRef) results ); state Future watchFuture = tr.watch(moveKeysLockOwnerKey); wait( tr.commit() ); @@ -591,21 +645,26 @@ ACTOR Future configurationMonitor( Ratekeeper* self, Reference rateKeeper( - Reference> dbInfo, - PromiseStream< std::pair> > serverChanges, - FutureStream< struct GetRateInfoRequest > getRateInfo, - double* lastLimited) -{ - state Ratekeeper self; - state Future track = trackEachStorageServer( &self, serverChanges.getFuture() ); +ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference> dbInfo) { + state RatekeeperData self; state Future timeout = Void(); state std::vector> actors; state std::vector> tlogTrackers; state std::vector tlogInterfs; state Promise err; - state Future configMonitor = configurationMonitor(&self, dbInfo); - self.lastLimited = lastLimited; + state Future collection = actorCollection( self.addActor.getFuture() ); + + // TODOs: + double lastLimited; + self.lastLimited = &lastLimited; + + TraceEvent("Ratekeeper_Starting", rkInterf.id()); + self.addActor.send( waitFailureServer(rkInterf.waitFailure.getFuture()) ); + self.addActor.send( configurationMonitor(dbInfo, &self.configuration) ); + + PromiseStream< std::pair> > serverChanges; + self.addActor.send( monitorServerListChange(dbInfo, serverChanges) ); + self.addActor.send( trackEachStorageServer(&self, serverChanges.getFuture()) ); TraceEvent("RkTLogQueueSizeParameters").detail("Target", SERVER_KNOBS->TARGET_BYTES_PER_TLOG).detail("Spring", SERVER_KNOBS->SPRING_BYTES_TLOG) .detail("Rate", (SERVER_KNOBS->TARGET_BYTES_PER_TLOG - SERVER_KNOBS->SPRING_BYTES_TLOG) / ((((double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) / SERVER_KNOBS->VERSIONS_PER_SECOND) + 2.0)); @@ -619,7 +678,6 @@ ACTOR Future rateKeeper( loop{ choose { - when (wait( track )) { break; } when (wait( timeout )) { updateRate(&self, self.normalLimits); updateRate(&self, self.batchLimits); @@ -638,7 +696,7 @@ ACTOR Future rateKeeper( } timeout = delayJittered(SERVER_KNOBS->METRIC_UPDATE_RATE); } - when (GetRateInfoRequest req = waitNext(getRateInfo)) { + when (GetRateInfoRequest req = waitNext(rkInterf.getRateInfo.getFuture())) { GetRateInfoReply reply; auto& p 
= self.proxy_transactionCounts[ req.requesterID ]; @@ -672,8 +730,10 @@ ACTOR Future rateKeeper( tlogTrackers.push_back( splitError( trackTLogQueueInfo(&self, tlogInterfs[i]), err ) ); } } - when(wait(configMonitor)) {} + when ( wait(collection) ) { + ASSERT(false); + throw internal_error(); + } } } - return Void(); } diff --git a/fdbserver/Ratekeeper.h b/fdbserver/Ratekeeper.h deleted file mode 100644 index 282e99f766..0000000000 --- a/fdbserver/Ratekeeper.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Ratekeeper.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef FDBSERVER_RATEKEEPER_H -#define FDBSERVER_RATEKEEPER_H -#pragma once - -#include "fdbserver/MasterInterface.h" -#include "fdbserver/TLogInterface.h" -#include "fdbclient/DatabaseConfiguration.h" - -Future rateKeeper( - Reference> const& dbInfo, - PromiseStream< std::pair> > const& serverChanges, // actually an input, but we don't want broken_promise - FutureStream< struct GetRateInfoRequest > const& getRateInfo, - double* const& lastLimited); - -#endif diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h new file mode 100644 index 0000000000..cb5049a595 --- /dev/null +++ b/fdbserver/RatekeeperInterface.h @@ -0,0 +1,93 @@ +/* + * DataDistributorInterface.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef FDBSERVER_RATEKEEPERINTERFACE_H
+#define FDBSERVER_RATEKEEPERINTERFACE_H
+
+#include "fdbclient/StorageServerInterface.h"
+#include "fdbclient/FDBTypes.h"
+#include "fdbrpc/fdbrpc.h"
+#include "fdbrpc/Locality.h"
+
+struct RatekeeperInterface {
+	RequestStream<ReplyPromise<Void>> waitFailure;
+	RequestStream<struct GetRateInfoRequest> getRateInfo;
+	RequestStream<struct StorageChangeRequest> changeStorage;
+	struct LocalityData locality;
+
+	RatekeeperInterface() {}
+	explicit RatekeeperInterface(const struct LocalityData& l) : locality(l) {}
+
+	void initEndpoints() {}
+	UID id() const { return getRateInfo.getEndpoint().token; }
+	NetworkAddress address() const { return getRateInfo.getEndpoint().address; }
+	bool operator== (const RatekeeperInterface& r) const {
+		return id() == r.id();
+	}
+	bool operator!= (const RatekeeperInterface& r) const {
+		return !(*this == r);
+	}
+
+	template <class Archive>
+	void serialize(Archive& ar) {
+		serializer(ar, waitFailure, getRateInfo, changeStorage, locality);
+	}
+};
+
+struct GetRateInfoRequest {
+	UID requesterID;
+	int64_t totalReleasedTransactions;
+	int64_t batchReleasedTransactions;
+	bool detailed;
+	ReplyPromise<struct GetRateInfoReply> reply;
+
+	GetRateInfoRequest() {}
+	GetRateInfoRequest(UID const& requesterID, int64_t totalReleasedTransactions, int64_t batchReleasedTransactions, bool detailed)
+		: requesterID(requesterID), totalReleasedTransactions(totalReleasedTransactions), batchReleasedTransactions(batchReleasedTransactions), detailed(detailed) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, requesterID, totalReleasedTransactions, batchReleasedTransactions, detailed, reply);
+	}
+};
+
+struct GetRateInfoReply {
+	double transactionRate;
+	double batchTransactionRate;
+	double leaseDuration;
+	HealthMetrics healthMetrics;
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, transactionRate, batchTransactionRate, leaseDuration, healthMetrics);
+	}
+};
+
+struct StorageChangeRequest {
+	UID ssID;
+	Optional<StorageServerInterface> ssInterf;
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, ssID, ssInterf);
+	}
+};
+
+#endif //FDBSERVER_RATEKEEPERINTERFACE_H
diff --git a/fdbserver/ServerDBInfo.h b/fdbserver/ServerDBInfo.h
index abb7be412c..c5a76f831b 100644
--- a/fdbserver/ServerDBInfo.h
+++ b/fdbserver/ServerDBInfo.h
@@ -26,6 +26,7 @@
 #include "fdbserver/DataDistributorInterface.h"
 #include "fdbserver/MasterInterface.h"
 #include "fdbserver/LogSystemConfig.h"
+#include "fdbserver/RatekeeperInterface.h"
 #include "fdbserver/RecoveryState.h"
 #include "fdbserver/LatencyBandConfig.h"

@@ -39,6 +40,7 @@ struct ServerDBInfo {
 	ClientDBInfo client; // After a successful recovery, eventually proxies that communicate with it
 	Optional<DataDistributorInterface> distributor; // The best guess of current data distributor.
 	MasterInterface master; // The best guess as to the most recent master, which might still be recovering
+	Optional<RatekeeperInterface> ratekeeper;
 	vector<ResolverInterface> resolvers;
 	DBRecoveryCount recoveryCount; // A recovery count from DBCoreState. A successful master recovery increments it twice; unsuccessful recoveries may increment it once. Depending on where the current master is in its recovery process, this might not have been written by the current master.
RecoveryState recoveryState; diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index c3f6b1bb49..494ff3fcd3 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -28,6 +28,7 @@ #include "fdbserver/DataDistributorInterface.h" #include "fdbserver/MasterInterface.h" #include "fdbserver/TLogInterface.h" +#include "fdbserver/RatekeeperInterface.h" #include "fdbserver/ResolverInterface.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/TesterInterface.actor.h" @@ -46,6 +47,7 @@ struct WorkerInterface { RequestStream< struct RecruitMasterRequest > master; RequestStream< struct InitializeMasterProxyRequest > masterProxy; RequestStream< struct InitializeDataDistributorRequest > dataDistributor; + RequestStream< struct InitializeRatekeeperRequest > ratekeeper; RequestStream< struct InitializeResolverRequest > resolver; RequestStream< struct InitializeStorageRequest > storage; RequestStream< struct InitializeLogRouterRequest > logRouter; @@ -68,7 +70,7 @@ struct WorkerInterface { template void serialize(Ar& ar) { - serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest); + serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, ratekeeper, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest); } }; @@ -151,6 +153,16 @@ struct InitializeDataDistributorRequest { } }; +struct InitializeRatekeeperRequest { + UID reqId; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, reqId, reply); + } +}; + struct InitializeResolverRequest { uint64_t recoveryCount; int proxyCount; @@ -300,6 +312,7 @@ struct Role { static const Role TESTER; static const Role LOG_ROUTER; static const Role DATA_DISTRIBUTOR; + static const Role RATE_KEEPER; std::string roleName; std::string abbreviation; @@ -361,6 +374,7 @@ ACTOR Future resolver(ResolverInterface proxy, InitializeResolverRequest i ACTOR Future logRouter(TLogInterface interf, InitializeLogRouterRequest req, Reference> db); ACTOR Future dataDistributor(DataDistributorInterface ddi, Reference> db); +ACTOR Future rateKeeper(RatekeeperInterface const& rki, Reference> const& db); void registerThreadForProfiling(); void updateCpuProfiler(ProfilerRequest req); diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 36e73c121a..483e0e5aec 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -191,7 +191,7 @@ false - + diff --git a/fdbserver/fdbserver.vcxproj.filters b/fdbserver/fdbserver.vcxproj.filters index 9c27ac6fad..3395d97ab0 100644 --- a/fdbserver/fdbserver.vcxproj.filters +++ b/fdbserver/fdbserver.vcxproj.filters @@ -310,6 +310,7 @@ + @@ -343,7 +344,7 @@ - + diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 6460f49403..d242ec446c 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -31,7 +31,6 @@ #include #include "fdbserver/WaitFailure.h" #include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/Ratekeeper.h" #include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/CoordinatedState.h" diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 
c4b0dd0def..88d8ec85d0 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -349,14 +349,15 @@ ACTOR Future registrationClient( WorkerInterface interf, Reference> asyncPriorityInfo, ProcessClass initialClass, - Reference>> ddInterf) { + Reference>> ddInterf, + Reference>> rkInterf) { // Keeps the cluster controller (as it may be re-elected) informed that this worker exists // The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply (requiring us to re-register) // The registration request piggybacks optional distributor interface if it exists. state Generation requestGeneration = 0; state ProcessClass processClass = initialClass; loop { - RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get()); + RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get()); Future registrationReply = ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().registerWorker.getReply(request) ) : Never(); choose { when ( RegisterWorkerReply reply = wait( registrationReply )) { @@ -365,6 +366,7 @@ ACTOR Future registrationClient( } when ( wait( ccInterface->onChange() )) { } when ( wait( ddInterf->onChange() ) ) {} + when ( wait( rkInterf->onChange() ) ) {} } } } @@ -610,6 +612,7 @@ ACTOR Future workerServer( Reference connFile, Refe Reference> asyncPriorityInfo, ProcessClass initialClass, std::string folder, int64_t memoryLimit, std::string metricsConnFile, std::string metricsPrefix, Promise recoveredDiskFiles) { state PromiseStream< ErrorInfo > errors; state Reference>> ddInterf( new AsyncVar>() ); + state Reference>> rkInterf( new AsyncVar>() ); state Future handleErrors = workerHandleErrors( errors.getFuture() ); // Needs to be stopped last state ActorCollection errorForwarders(false); state Future loggingTrigger = Void(); @@ -756,7 +759,7 @@ ACTOR Future workerServer( Reference connFile, Refe wait(waitForAll(recoveries)); recoveredDiskFiles.send(Void()); - errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf ) ); + errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf ) ); TraceEvent("RecoveriesComplete", interf.id()); @@ -837,6 +840,22 @@ ACTOR Future workerServer( Reference connFile, Refe TraceEvent("DataDistributorReceived", req.reqId).detail("DataDistributorId", recruited.id()); req.reply.send(recruited); } + when ( InitializeRatekeeperRequest req = waitNext(interf.ratekeeper.getFuture()) ) { + RatekeeperInterface recruited(locality); + recruited.initEndpoints(); + + if (rkInterf->get().present()) { + recruited = rkInterf->get().get(); + TEST(true); // Recruited while already a ratekeeper. + } else { + startRole(Role::RATE_KEEPER, recruited.id(), interf.id()); + Future ratekeeper = rateKeeper( recruited, dbInfo ); + errorForwarders.add( forwardError( errors, Role::RATE_KEEPER, recruited.id(), setWhenDoneOrError( ratekeeper, rkInterf, Optional() ) ) ); + rkInterf->set(Optional(recruited)); + } + TraceEvent("Ratekeeper_InitRequest", req.reqId).detail("RatekeeperId", recruited.id()); + req.reply.send(recruited); + } when( InitializeTLogRequest req = waitNext(interf.tLog.getFuture()) ) { // For now, there's a one-to-one mapping of spill type to TLogVersion. 
// With future work, a particular version of the TLog can support multiple @@ -1244,3 +1263,4 @@ const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC"); const Role Role::TESTER("Tester", "TS"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); +const Role Role::RATE_KEEPER("RateKeeper", "RK"); diff --git a/flow/network.h b/flow/network.h index 3b569da349..0437c5febe 100644 --- a/flow/network.h +++ b/flow/network.h @@ -67,6 +67,7 @@ enum { TaskUnknownEndpoint = 4000, TaskMoveKeys = 3550, TaskDataDistributionLaunch = 3530, + TaskRateKeeper = 3510, TaskDataDistribution = 3500, TaskDiskWrite = 3010, TaskUpdateStorage = 3000, From e6ac3f7fe8bd635c2954a8e3c8d814e69c3a36b4 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 15 Feb 2019 17:29:52 -0800 Subject: [PATCH 02/46] Minor fix on ratekeeper work registration. --- fdbserver/ClusterController.actor.cpp | 17 +++++++++----- fdbserver/DataDistribution.actor.cpp | 33 +++++++++++++++++++++++++++ fdbserver/Ratekeeper.actor.cpp | 4 ++-- fdbserver/RatekeeperInterface.h | 13 +---------- 4 files changed, 47 insertions(+), 20 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index d95b9c1385..be5bfc53e4 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1769,6 +1769,11 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { TraceEvent("ClusterController_RegisterDataDistributor", self->id).detail("DDID", di.id()); self->db.setDistributor(di); } + if ( req.ratekeeperInterf.present() && !self->db.serverInfo->get().ratekeeper.present() ) { + const RatekeeperInterface& rki = req.ratekeeperInterf.get(); + TraceEvent("ClusterController_RegisterRatekeeper", self->id).detail("RKID", rki.id()); + self->db.setRatekeeper(rki); + } if( info == self->id_worker.end() ) { self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo ); checkOutstandingRequests( self ); @@ -2398,7 +2403,7 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { req.reqId = g_random->randomUniqueID(); TraceEvent("ClusterController_RecruitRatekeeper", req.reqId).detail("Addr", rkWorker.worker.first.address()); - ErrorOr interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); + ErrorOr interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY, 0) ); if (interf.present()) { TraceEvent("ClusterController_RatekeeperRecruited", req.reqId).detail("Addr", rkWorker.worker.first.address()); return interf.get(); @@ -2414,7 +2419,7 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { } } -ACTOR Future waitRKRejoinOrStartRK(ClusterControllerData *self) { +ACTOR Future monitorRatekeeper(ClusterControllerData *self) { state Future initialDelay = delay(SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY); // wait for a while to see if an existing ratekeeper will join. 
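The rejoin path being fixed here rests on workers piggybacking their locally
running ratekeeper onto worker registration. In outline, combining the two
sides already shown in this series (types as reconstructed from the patch):

    // worker side (registrationClient): advertise the local singleton roles
    RegisterWorkerRequest request(interf, initialClass, processClass,
                                  asyncPriorityInfo->get(), requestGeneration++,
                                  ddInterf->get(), rkInterf->get());

    // controller side (registerWorker): adopt a ratekeeper only if none is known
    if ( req.ratekeeperInterf.present() && !self->db.serverInfo->get().ratekeeper.present() ) {
        self->db.setRatekeeper(req.ratekeeperInterf.get());
    }

A re-elected cluster controller therefore learns about a surviving ratekeeper
within one registration round instead of recruiting a duplicate.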
@@ -2432,8 +2437,8 @@ ACTOR Future<Void> waitRKRejoinOrStartRK(ClusterControllerData *self) {
 	loop {
 		if ( self->db.serverInfo->get().ratekeeper.present() ) {
 			wait( waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ) );
-			TraceEvent("ClusterController", self->id)
-				.detail("RatekeeperDied", self->db.serverInfo->get().ratekeeper.get().id());
+			TraceEvent("ClusterController_RateKeeperDied", self->id)
+				.detail("RKID", self->db.serverInfo->get().ratekeeper.get().id());
 			self->db.clearInterf(ProcessClass::RateKeeperClass);
 		} else {
 			RatekeeperInterface rkInterf = wait( startRatekeeper(self) );
@@ -2460,7 +2465,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
 	self.addActor.send( updateDatacenterVersionDifference(&self) );
 	self.addActor.send( handleForcedRecoveries(&self, interf) );
 	self.addActor.send( waitDDRejoinOrStartDD(&self) );
-	self.addActor.send( waitRKRejoinOrStartRK(&self) );
+	self.addActor.send( monitorRatekeeper(&self) );

 	//printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
 	loop choose {
diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
index fe52e5e42a..e1b80d2314 100644
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ -3454,6 +3454,39 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
 	state DatabaseConfiguration configuration = self->configuration->get();
 	cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE;

+<<<<<<< HEAD
+=======
+	state Transaction tr(cx);
+	loop {
+		try {
+			tr.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS );
+			tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
+
+			Standalone<RangeResultRef> replicaKeys = wait(tr.getRange(datacenterReplicasKeys, CLIENT_KNOBS->TOO_MANY));
+
+			for(auto& kv : replicaKeys) {
+				auto dcId = decodeDatacenterReplicasKey(kv.key);
+				auto replicas = decodeDatacenterReplicasValue(kv.value);
+				if ((self->primaryDcId.size() && self->primaryDcId[0] == dcId) ||
+				    (self->remoteDcIds.size() && self->remoteDcIds[0] == dcId && configuration.usableRegions > 1)) {
+					if(replicas > configuration.storageTeamSize) {
+						tr.set(kv.key, datacenterReplicasValue(configuration.storageTeamSize));
+					}
+				} else {
+					tr.clear(kv.key);
+				}
+			}
+
+			wait(tr.commit());
+			break;
+		}
+		catch(Error &e) {
+			wait(tr.onError(e));
+		}
+	}
+
+
+>>>>>>> Minor fix on ratekeeper work registration.
 	//cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) );
 	//ASSERT( cx->locationCacheSize == SERVER_KNOBS->DD_LOCATION_CACHE_SIZE );

diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp
index 83a8778411..f872a55566 100644
--- a/fdbserver/Ratekeeper.actor.cpp
+++ b/fdbserver/Ratekeeper.actor.cpp
@@ -3,7 +3,7 @@
  *
  * This source file is part of the FoundationDB open source project
  *
- * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
+ * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
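The datacenterReplicas cleanup added above (between the stray conflict markers
this commit accidentally carries) uses the standard fdbclient retry idiom. In
minimal form, with the read/write body elided (illustrative sketch only):

    state Transaction tr(cx);
    loop {
        try {
            tr.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS );
            // ... reads, conditional sets/clears ...
            wait( tr.commit() );
            break;                  // success: leave the retry loop
        } catch (Error& e) {
            wait( tr.onError(e) );  // backs off and resets tr on retryable errors,
                                    // rethrows anything non-retryable
        }
    }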
@@ -655,7 +655,7 @@ ACTOR Future<Void> rateKeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<ServerDBInfo>> dbInfo) {
 	state Future<Void> collection = actorCollection( self.addActor.getFuture() );

 	// TODOs:
-	double lastLimited;
+	double lastLimited = 0;
 	self.lastLimited = &lastLimited;

 	TraceEvent("Ratekeeper_Starting", rkInterf.id());
diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h
index cb5049a595..539aeb8d7f 100644
--- a/fdbserver/RatekeeperInterface.h
+++ b/fdbserver/RatekeeperInterface.h
@@ -29,7 +29,6 @@ struct RatekeeperInterface {
 	RequestStream<ReplyPromise<Void>> waitFailure;
 	RequestStream<GetRateInfoRequest> getRateInfo;
-	RequestStream<StorageChangeRequest> changeStorage;
 	struct LocalityData locality;

 	RatekeeperInterface() {}
@@ -47,7 +46,7 @@ struct RatekeeperInterface {
 	template <class Archive>
 	void serialize(Archive& ar) {
-		serializer(ar, waitFailure, getRateInfo, changeStorage, locality);
+		serializer(ar, waitFailure, getRateInfo, locality);
 	}
 };

@@ -80,14 +79,4 @@ struct GetRateInfoReply {
 	}
 };

-struct StorageChangeRequest {
-	UID ssID;
-	Optional<StorageServerInterface> ssInterf;
-
-	template <class Ar>
-	void serialize(Ar& ar) {
-		serializer(ar, ssID, ssInterf);
-	}
-};
-
 #endif //FDBSERVER_RATEKEEPERINTERFACE_H

From 36a51a7b57d7a6f5f92249edb26c30fa2468f794 Mon Sep 17 00:00:00 2001
From: Jingyu Zhou
Date: Mon, 18 Feb 2019 14:57:21 -0800
Subject: [PATCH 03/46] Fix a segfault bug due to uncopied ratekeeper interface

---
 fdbserver/CMakeLists.txt              |  1 -
 fdbserver/ClusterController.actor.cpp | 11 +++++++--
 fdbserver/DataDistribution.actor.cpp  | 32 +++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt
index 58853a2fee..31af61de23 100644
--- a/fdbserver/CMakeLists.txt
+++ b/fdbserver/CMakeLists.txt
@@ -56,7 +56,6 @@ set(FDBSERVER_SRCS
   QuietDatabase.actor.cpp
   QuietDatabase.h
   Ratekeeper.actor.cpp
-  Ratekeeper.h
   RatekeeperInterface.h
   RecoveryState.h
   Restore.actor.cpp
diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp
index be5bfc53e4..fef69bc7e6 100644
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@@ -936,6 +936,9 @@ public:
 		if (db.serverInfo->get().distributor.present()) {
 			id_used[db.serverInfo->get().distributor.get().locality.processId()]++;
 		}
+		if (db.serverInfo->get().ratekeeper.present()) {
+			id_used[db.serverInfo->get().ratekeeper.get().locality.processId()]++;
+		}
 		WorkerFitnessInfo mworker = getWorkerForRoleInDatacenter(clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db.config, id_used, true);

 		if ( oldMasterFit < mworker.fitness )
@@ -1121,6 +1124,9 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, ClusterControllerData::DBInfo* db )
 		if (cluster->db.serverInfo->get().distributor.present()) {
 			id_used[cluster->db.serverInfo->get().distributor.get().locality.processId()]++;
 		}
+		if (cluster->db.serverInfo->get().ratekeeper.present()) {
+			id_used[cluster->db.serverInfo->get().ratekeeper.get().locality.processId()]++;
+		}
 		state WorkerFitnessInfo masterWorker = cluster->getWorkerForRoleInDatacenter(cluster->clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db->config, id_used);
 		if( ( masterWorker.worker.second.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS || masterWorker.worker.first.locality.processId() == cluster->clusterControllerProcessId )
 			&& now() - cluster->startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY ) {
@@ -1156,6 +1162,7 @@
 		++dbInfo.masterLifetime;
 		dbInfo.clusterInterface =
db->serverInfo->get().clusterInterface; dbInfo.distributor = db->serverInfo->get().distributor; + dbInfo.ratekeeper = db->serverInfo->get().ratekeeper; TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id); db->serverInfo->set( dbInfo ); @@ -2361,7 +2368,7 @@ ACTOR Future startDataDistributor( ClusterControllerDa } } -ACTOR Future waitDDRejoinOrStartDD(ClusterControllerData *self) { +ACTOR Future monitorDataDistributor(ClusterControllerData *self) { state Future initialDelay = delay(SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY); // wait for a while to see if existing data distributor will join. @@ -2464,7 +2471,7 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, self.addActor.send( updatedChangedDatacenters(&self) ); self.addActor.send( updateDatacenterVersionDifference(&self) ); self.addActor.send( handleForcedRecoveries(&self, interf) ); - self.addActor.send( waitDDRejoinOrStartDD(&self) ); + self.addActor.send( monitorDataDistributor(&self) ); self.addActor.send( monitorRatekeeper(&self) ); //printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str()); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index e1b80d2314..a46d5d8f01 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3447,6 +3447,38 @@ ACTOR Future pollMoveKeysLock( Database cx, MoveKeysLock lock ) { } } +<<<<<<< HEAD +======= +struct DataDistributorData : NonCopyable, ReferenceCounted { + Reference> dbInfo; + Reference> configuration; + std::vector> primaryDcId; + std::vector> remoteDcIds; + UID ddId; + PromiseStream> addActor; + + DataDistributorData(Reference> const& db, Reference> const& dbConfig, UID id) + : dbInfo(db), configuration(dbConfig), ddId(id) {} + + void refreshDcIds() { + primaryDcId.clear(); + remoteDcIds.clear(); + + const std::vector& regions = configuration->get().regions; + TraceEvent ev("DataDistributor", ddId); + if ( regions.size() > 0 ) { + primaryDcId.push_back( regions[0].dcId ); + ev.detail("PrimaryDcID", regions[0].dcId.toHexString()); + } + if ( regions.size() > 1 ) { + remoteDcIds.push_back( regions[1].dcId ); + ev.detail("SecondaryDcID", regions[1].dcId.toHexString()); + } + } +}; + +// TODO: remove lastLimited -- obtain this information of ratekeeper from proxy +>>>>>>> Fix a segfault bug due to uncopied ratekeeper interface ACTOR Future dataDistribution(Reference self, double* lastLimited) { From b2ee41ba33570f1db9b120ec3785f923a6fd68f1 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 19 Feb 2019 14:04:45 -0800 Subject: [PATCH 04/46] Remove lastLimited from data distribution Fix a serialization bug in ServerDBInfo, which causes test failures. 
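Note on the serialization bug: flow's serializer() writes fields in the order
they are listed, so a field added to ServerDBInfo must appear at the same
position in serialize() on every process that exchanges it; the fix below adds
the new ratekeeper field to the serializer in a consistent position. A minimal
sketch of the failure mode (simplified struct, not the real ServerDBInfo):

    struct ExampleDBInfo {
        UID id;
        Optional<DataDistributorInterface> distributor;
        Optional<RatekeeperInterface> ratekeeper; // newly added field

        template <class Ar>
        void serialize(Ar& ar) {
            // If the sender lists `ratekeeper` here but a receiver still uses
            // the old field list (or a different position), every field after
            // the mismatch is decoded from the wrong bytes.
            serializer(ar, id, distributor, ratekeeper);
        }
    };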
---
 fdbserver/DataDistribution.actor.cpp      | 24 ++++++++++++++++++++++-
 fdbserver/DataDistribution.actor.h        |  3 +--
 fdbserver/DataDistributionQueue.actor.cpp |  6 +++---
 fdbserver/Ratekeeper.actor.cpp            |  9 ++++++---
 fdbserver/ServerDBInfo.h                  |  2 +-
 5 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
index a46d5d8f01..667e606b35 100644
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ -3478,9 +3478,13 @@ struct DataDistributorData : NonCopyable, ReferenceCounted<DataDistributorData>
 };

 // TODO: remove lastLimited -- obtain this information of ratekeeper from proxy
+<<<<<<< HEAD
 >>>>>>> Fix a segfault bug due to uncopied ratekeeper interface
 ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self, double* lastLimited)
+=======
+ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
+>>>>>>> Remove lastLimited from data distribution
 {
 	state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true);
 	state DatabaseConfiguration configuration = self->configuration->get();
@@ -3671,7 +3675,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
 	actors.push_back( pollMoveKeysLock(cx, lock) );
 	actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) );
-	actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) );
+	actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize ), "DDQueue", self->ddId, &normalDDQueueErrors() ) );

 	vector<DDTeamCollection*> teamCollectionsPtrs;
 	Reference<DDTeamCollection> primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->primaryDcId, configuration.usableRegions > 1 ?
self->remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); @@ -3730,10 +3734,28 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference distributor = reportErrorsExcept( dataDistribution( self->dbInfo, &lastLimited ), "DataDistribution", di.id(), &normalDataDistributorErrors() ); wait( distributor || collection ); +======= + state Future distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); + + loop choose { + when ( wait( self->configuration->onChange() ) ) { + TraceEvent("DataDistributor_Restart", di.id()) + .detail("Configuration", self->configuration->get().toString()); + self->refreshDcIds(); + distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); + } + when ( wait( collection ) ) { + ASSERT(false); + throw internal_error(); + } + when ( wait( distributor ) ) {} + } +>>>>>>> Remove lastLimited from data distribution } catch ( Error &err ) { if ( normalDataDistributorErrors().count(err.code()) == 0 ) { diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 1baff7f58e..109c57878b 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -230,8 +230,7 @@ Future dataDistributionQueue( MoveKeysLock const& lock, PromiseStream> const& getAverageShardBytes, UID const& distributorId, - int const& teamSize, - double* const& lastLimited); + int const& teamSize); //Holds the permitted size and IO Bounds for a shard struct ShardSizeBounds { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index f282ef12bc..f0b8d28d41 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1201,10 +1201,10 @@ ACTOR Future dataDistributionQueue( MoveKeysLock lock, PromiseStream> getAverageShardBytes, UID distributorId, - int teamSize, - double* lastLimited) + int teamSize) { - state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, lastLimited ); + state double lastLimited = 0; + state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, &lastLimited ); state std::set serversToLaunchFrom; state KeyRange keysToLaunchFrom; state RelocateData launchData; diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index f872a55566..81daee2629 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -648,7 +648,6 @@ ACTOR Future configurationMonitor(Reference> dbInfo ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference> dbInfo) { state RatekeeperData self; state Future timeout = Void(); - state std::vector> actors; state std::vector> tlogTrackers; state std::vector tlogInterfs; state Promise err; @@ -676,8 +675,8 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference rateKeeper(RatekeeperInterface rkInterf, Reference void serialize( Ar& ar ) { - serializer(ar, id, clusterInterface, client, distributor, master, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig); + serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, 
priorCommittedLogServers, latencyBandConfig); } }; From d52ff738c0dba2b4003cd1bb69bd89d62dfa921e Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 19 Feb 2019 21:05:24 -0800 Subject: [PATCH 05/46] Fix merge conflicts during rebase. --- fdbserver/DataDistribution.actor.cpp | 9 +++++++-- fdbserver/RatekeeperInterface.h | 2 +- fdbserver/WorkerInterface.actor.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 667e606b35..7a18ee8a32 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3535,9 +3535,10 @@ ACTOR Future dataDistribution(Reference self) loop { try { loop { - TraceEvent("DDInitTakingMoveKeysLock", myId); - MoveKeysLock lock_ = wait( takeMoveKeysLock( cx, myId ) ); + TraceEvent("DDInitTakingMoveKeysLock", self->ddId); + MoveKeysLock lock_ = wait( takeMoveKeysLock( cx, self->ddId ) ); lock = lock_; +<<<<<<< HEAD TraceEvent("DDInitTookMoveKeysLock", myId); DatabaseConfiguration configuration_ = wait( getDatabaseConfiguration(cx) ); @@ -3584,6 +3585,10 @@ ACTOR Future dataDistribution(Reference self) TraceEvent("DDInitUpdatedReplicaKeys", myId); Reference initData_ = wait( getInitialDataDistribution(cx, myId, lock, configuration.usableRegions > 1 ? remoteDcIds : std::vector>() ) ); +======= + TraceEvent("DDInitTookMoveKeysLock", self->ddId); + Reference initData_ = wait( getInitialDataDistribution(cx, self->ddId, lock, configuration.usableRegions > 1 ? self->remoteDcIds : std::vector>() ) ); +>>>>>>> Fix merge conflicts during rebase. initData = initData_; if(initData->shards.size() > 1) { TraceEvent("DDInitGotInitialDD", self->ddId) diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h index 539aeb8d7f..36a47e167a 100644 --- a/fdbserver/RatekeeperInterface.h +++ b/fdbserver/RatekeeperInterface.h @@ -36,7 +36,7 @@ struct RatekeeperInterface { void initEndpoints() {} UID id() const { return getRateInfo.getEndpoint().token; } - NetworkAddress address() const { return getRateInfo.getEndpoint().address; } + NetworkAddress address() const { return getRateInfo.getEndpoint().getPrimaryAddress(); } bool operator== (const RatekeeperInterface& r) const { return id() == r.id(); } diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 494ff3fcd3..370d780127 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -374,7 +374,7 @@ ACTOR Future resolver(ResolverInterface proxy, InitializeResolverRequest i ACTOR Future logRouter(TLogInterface interf, InitializeLogRouterRequest req, Reference> db); ACTOR Future dataDistributor(DataDistributorInterface ddi, Reference> db); -ACTOR Future rateKeeper(RatekeeperInterface const& rki, Reference> const& db); +ACTOR Future rateKeeper(RatekeeperInterface rki, Reference> db); void registerThreadForProfiling(); void updateCpuProfiler(ProfilerRequest req); From 517966fce219e484d90512108d09fdfeada848eb Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 20 Feb 2019 15:47:55 -0800 Subject: [PATCH 06/46] Remove lastLimited from rate keeper Refactor code to make IDE happy. 
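In the recruitment functions below, note the `state` qualifiers: in flow, only
variables declared `state` are stored in the actor's compiler-generated state
object and survive a wait(); an ordinary local cannot be used across a
suspension point at all. Keeping `reqId` as a state variable lets the error
handler report it after a failed wait. A minimal sketch of the rule (names are
illustrative, not the actual cluster controller code):

    ACTOR Future<Void> recruitSketch(WorkerInterface worker) {
        state UID reqId = g_random->randomUniqueID(); // survives the wait below
        int attempt = 0;                              // cannot be used across a wait()
        try {
            wait( success( worker.ratekeeper.getReplyUnlessFailedFor(
                InitializeRatekeeperRequest(reqId), 1.0, 0 ) ) );
        } catch (Error& e) {
            TraceEvent("RecruitError", reqId).error(e); // reqId still valid here
        }
        return Void();
    }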
--- fdbserver/ClusterController.actor.cpp | 14 ++++++++------ fdbserver/Ratekeeper.actor.cpp | 5 ----- fdbserver/WorkerInterface.actor.h | 4 ++++ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index fef69bc7e6..c4b0213eeb 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -2336,11 +2336,11 @@ ACTOR Future handleForcedRecoveries( ClusterControllerData *self, ClusterC ACTOR Future startDataDistributor( ClusterControllerData *self ) { state Optional dcId = self->clusterControllerDcId; - state InitializeDataDistributorRequest req; while ( !self->clusterControllerProcessId.present() || !self->masterProcessId.present() ) { wait( delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY) ); } + state UID reqId; loop { try { while ( self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ) { @@ -2349,7 +2349,8 @@ ACTOR Future startDataDistributor( ClusterControllerDa std::map>, int> id_used = self->getUsedIds(); state WorkerFitnessInfo data_distributor = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::DataDistributor, ProcessClass::NeverAssign, self->db.config, id_used); - req.reqId = g_random->randomUniqueID(); + reqId = g_random->randomUniqueID(); + state InitializeDataDistributorRequest req(reqId); TraceEvent("ClusterController_DataDistributorRecruit", req.reqId).detail("Addr", data_distributor.worker.first.address()); ErrorOr distributor = wait( data_distributor.worker.first.dataDistributor.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); @@ -2359,7 +2360,7 @@ ACTOR Future startDataDistributor( ClusterControllerDa } } catch (Error& e) { - TraceEvent("ClusterController_DataDistributorRecruitError", req.reqId).error(e); + TraceEvent("ClusterController_DataDistributorRecruitError", reqId).error(e); if ( e.code() != error_code_no_more_servers ) { throw; } @@ -2397,6 +2398,7 @@ ACTOR Future monitorDataDistributor(ClusterControllerData *self) { } ACTOR Future startRatekeeper(ClusterControllerData *self) { + state UID reqId; loop { try { while ( self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ) { @@ -2406,8 +2408,8 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { std::map>, int> id_used = self->getUsedIds(); Optional dcId = self->clusterControllerDcId; state WorkerFitnessInfo rkWorker = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::RateKeeper, ProcessClass::NeverAssign, self->db.config, id_used); - state InitializeRatekeeperRequest req; - req.reqId = g_random->randomUniqueID(); + reqId = g_random->randomUniqueID(); + state InitializeRatekeeperRequest req(reqId); TraceEvent("ClusterController_RecruitRatekeeper", req.reqId).detail("Addr", rkWorker.worker.first.address()); ErrorOr interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY, 0) ); @@ -2417,7 +2419,7 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { } } catch (Error& e) { - TraceEvent("ClusterController_RatekeeperRecruitError", req.reqId).error(e); + TraceEvent("ClusterController_RatekeeperRecruitError", reqId).error(e); if ( e.code() != error_code_no_more_servers ) { throw; } diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 81daee2629..ec6150a87e 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -160,7 +160,6 @@ struct RatekeeperData { Int64MetricHandle 
actualTpsMetric; double lastWarning; - double* lastLimited; RatekeeperLimits normalLimits; RatekeeperLimits batchLimits; @@ -653,10 +652,6 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference err; state Future collection = actorCollection( self.addActor.getFuture() ); - // TODOs: - double lastLimited = 0; - self.lastLimited = &lastLimited; - TraceEvent("Ratekeeper_Starting", rkInterf.id()); self.addActor.send( waitFailureServer(rkInterf.waitFailure.getFuture()) ); self.addActor.send( configurationMonitor(dbInfo, &self.configuration) ); diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 370d780127..f79a96f623 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -147,6 +147,8 @@ struct InitializeDataDistributorRequest { UID reqId; ReplyPromise reply; + InitializeDataDistributorRequest() {} + explicit InitializeDataDistributorRequest(UID uid) : reqId(uid) {} template void serialize( Ar& ar ) { serializer(ar, reqId, reply); @@ -157,6 +159,8 @@ struct InitializeRatekeeperRequest { UID reqId; ReplyPromise reply; + InitializeRatekeeperRequest() {} + explicit InitializeRatekeeperRequest(UID uid) : reqId(uid) {} template void serialize(Ar& ar) { serializer(ar, reqId, reply); From 5dcde9efe0a6cb6ce22bd3b90de4507a3e887715 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 21 Feb 2019 15:30:39 -0800 Subject: [PATCH 07/46] Fix locality per review comment and a mac compile error --- fdbrpc/Locality.cpp | 66 ++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/fdbrpc/Locality.cpp b/fdbrpc/Locality.cpp index ff9135d77e..caf3fa1d70 100644 --- a/fdbrpc/Locality.cpp +++ b/fdbrpc/Locality.cpp @@ -164,49 +164,35 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons } case ProcessClass::DataDistributor: switch( _class ) { - case ProcessClass::DataDistributorClass: - return ProcessClass::BestFit; - case ProcessClass::StatelessClass: - return ProcessClass::GoodFit; - case ProcessClass::MasterClass: - return ProcessClass::OkayFit; - case ProcessClass::ResolutionClass: - return ProcessClass::OkayFit; - case ProcessClass::TransactionClass: - return ProcessClass::OkayFit; - case ProcessClass::ProxyClass: - return ProcessClass::OkayFit; - case ProcessClass::UnsetClass: - return ProcessClass::UnsetFit; - case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; - case ProcessClass::TesterClass: - return ProcessClass::NeverAssign; - default: - return ProcessClass::WorstFit; + case ProcessClass::DataDistributorClass: + return ProcessClass::BestFit; + case ProcessClass::StatelessClass: + return ProcessClass::GoodFit; + case ProcessClass::MasterClass: + return ProcessClass::OkayFit; + case ProcessClass::UnsetClass: + return ProcessClass::UnsetFit; + case ProcessClass::CoordinatorClass: + case ProcessClass::TesterClass: + return ProcessClass::NeverAssign; + default: + return ProcessClass::WorstFit; } case ProcessClass::RateKeeper: switch( _class ) { - case ProcessClass::RateKeeperClass: - return ProcessClass::BestFit; - case ProcessClass::StatelessClass: - return ProcessClass::GoodFit; - case ProcessClass::MasterClass: - return ProcessClass::OkayFit; - case ProcessClass::ResolutionClass: - return ProcessClass::OkayFit; - case ProcessClass::TransactionClass: - return ProcessClass::OkayFit; - case ProcessClass::ProxyClass: - return ProcessClass::OkayFit; - case ProcessClass::UnsetClass: - return ProcessClass::UnsetFit; - case 
ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; - case ProcessClass::TesterClass: - return ProcessClass::NeverAssign; - default: - return ProcessClass::WorstFit; + case ProcessClass::RateKeeperClass: + return ProcessClass::BestFit; + case ProcessClass::StatelessClass: + return ProcessClass::GoodFit; + case ProcessClass::MasterClass: + return ProcessClass::OkayFit; + case ProcessClass::UnsetClass: + return ProcessClass::UnsetFit; + case ProcessClass::CoordinatorClass: + case ProcessClass::TesterClass: + return ProcessClass::NeverAssign; + default: + return ProcessClass::WorstFit; } default: return ProcessClass::NeverAssign; From 734099826125f3d6081126a6a4d5f4a9bcfb20ea Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 22 Feb 2019 15:04:38 -0800 Subject: [PATCH 08/46] Fix status message for ratekeeper --- documentation/sphinx/source/mr-status.rst | 1 + fdbclient/Schemas.cpp | 1 + fdbserver/Status.actor.cpp | 21 +++++++++++++++++---- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/documentation/sphinx/source/mr-status.rst b/documentation/sphinx/source/mr-status.rst index 61d2103b16..441624f4ad 100644 --- a/documentation/sphinx/source/mr-status.rst +++ b/documentation/sphinx/source/mr-status.rst @@ -339,6 +339,7 @@ cluster.messages log_servers_error Time cluster.messages transaction_start_timeout Unable to start transaction after __ seconds. cluster.messages unreachable_master_worker Unable to locate the master worker. cluster.messages unreachable_dataDistributor_worker Unable to locate the data distributor worker. +cluster.messages unreachable_ratekeeper_worker Unable to locate the ratekeeper worker. cluster.messages unreachable_processes The cluster has some unreachable processes. cluster.messages unreadable_configuration Unable to read database configuration. cluster.messages layer_status_incomplete Some or all of the layers subdocument could not be read. 
diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 769d7e8d25..34b8f1826d 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -324,6 +324,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "$enum":[ "unreachable_master_worker", "unreachable_dataDistributor_worker", + "unreachable_ratekeeper_worker", "unreadable_configuration", "full_replication_timeout", "client_issues", diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 0b48d7fcf2..374d0728e3 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1387,7 +1387,7 @@ JsonBuilderObject getPerfLimit(TraceEventFields const& ratekeeper, double transP return perfLimit; } -ACTOR static Future workloadStatusFetcher(Reference> db, vector> workers, std::pair mWorker, std::pair ddWorker, +ACTOR static Future workloadStatusFetcher(Reference> db, vector> workers, std::pair mWorker, std::pair rkWorker, JsonBuilderObject *qos, JsonBuilderObject *data_overlay, std::set *incomplete_reasons, Future>>> storageServerFuture) { state JsonBuilderObject statusObj; @@ -1439,8 +1439,8 @@ ACTOR static Future workloadStatusFetcher(Reference clusterGetStatus( state std::set status_incomplete_reasons; state std::pair mWorker; state std::pair ddWorker; // DataDistributor worker + state std::pair rkWorker; // RateKeeper worker try { // Get the master Worker interface @@ -1837,6 +1838,18 @@ ACTOR Future clusterGetStatus( ddWorker = _ddWorker.get(); } + // Get the RateKeeper worker interface + Optional> _rkWorker; + if (db->get().ratekeeper.present()) { + _rkWorker = getWorker( workers, db->get().ratekeeper.get().address() ); + } + + if (!db->get().ratekeeper.present() || !_rkWorker.present()) { + messages.push_back(JsonString::makeMessage("unreachable_ratekeeper_worker", "Unable to locate the ratekeeper worker.")); + } else { + rkWorker = _rkWorker.get(); + } + // Get latest events for various event types from ALL workers // WorkerEvents is a map of worker's NetworkAddress to its event string // The pair represents worker responses and a set of worker NetworkAddress strings which did not respond @@ -1940,7 +1953,7 @@ ACTOR Future clusterGetStatus( state int minReplicasRemaining = -1; std::vector> futures2; futures2.push_back(dataStatusFetcher(ddWorker, &minReplicasRemaining)); - futures2.push_back(workloadStatusFetcher(db, workers, mWorker, ddWorker, &qos, &data_overlay, &status_incomplete_reasons, storageServerFuture)); + futures2.push_back(workloadStatusFetcher(db, workers, mWorker, rkWorker, &qos, &data_overlay, &status_incomplete_reasons, storageServerFuture)); futures2.push_back(layerStatusFetcher(cx, &messages, &status_incomplete_reasons)); futures2.push_back(lockedStatusFetcher(db, &messages, &status_incomplete_reasons)); From 835cc278c33819739520d2ce35cd7c84ce2d9db2 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 22 Feb 2019 16:36:07 -0800 Subject: [PATCH 09/46] Fix rebase conflicts. 
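The resolution below drops the duplicated datacenterReplicas cleanup that the
earlier conflict markers left at the top of dataDistribution() and keeps the
copy in the init path, re-tagged with self->ddId. For reference, that kept
transaction follows the standard system-keyspace retry idiom; a condensed
sketch (the helper name is hypothetical, the calls are the ones used below):

    ACTOR Future<Void> trimReplicaKeysSketch(Database cx, int storageTeamSize) {
        state Transaction tr(cx);
        loop {
            try {
                tr.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS );
                tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
                Standalone<RangeResultRef> replicas = wait( tr.getRange(datacenterReplicasKeys, CLIENT_KNOBS->TOO_MANY) );
                for (auto& kv : replicas) {
                    // clamp any stale replica count down to the configured team size
                    if (decodeDatacenterReplicasValue(kv.value) > storageTeamSize)
                        tr.set(kv.key, datacenterReplicasValue(storageTeamSize));
                }
                wait( tr.commit() );
                return Void();
            } catch (Error& e) {
                wait( tr.onError(e) ); // resets the transaction and retries when safe
            }
        }
    }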
--- fdbserver/DataDistribution.actor.cpp | 115 +++------------------------ 1 file changed, 9 insertions(+), 106 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 7a18ee8a32..d664e07249 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3447,82 +3447,19 @@ ACTOR Future pollMoveKeysLock( Database cx, MoveKeysLock lock ) { } } -<<<<<<< HEAD -======= struct DataDistributorData : NonCopyable, ReferenceCounted { Reference> dbInfo; - Reference> configuration; - std::vector> primaryDcId; - std::vector> remoteDcIds; UID ddId; PromiseStream> addActor; - DataDistributorData(Reference> const& db, Reference> const& dbConfig, UID id) - : dbInfo(db), configuration(dbConfig), ddId(id) {} - - void refreshDcIds() { - primaryDcId.clear(); - remoteDcIds.clear(); - - const std::vector& regions = configuration->get().regions; - TraceEvent ev("DataDistributor", ddId); - if ( regions.size() > 0 ) { - primaryDcId.push_back( regions[0].dcId ); - ev.detail("PrimaryDcID", regions[0].dcId.toHexString()); - } - if ( regions.size() > 1 ) { - remoteDcIds.push_back( regions[1].dcId ); - ev.detail("SecondaryDcID", regions[1].dcId.toHexString()); - } - } + DataDistributorData(Reference> const& db, UID id) : dbInfo(db), ddId(id) {} }; -// TODO: remove lastLimited -- obtain this information of ratekeeper from proxy -<<<<<<< HEAD ->>>>>>> Fix a segfault bug due to uncopied ratekeeper interface -ACTOR Future dataDistribution(Reference self, - double* lastLimited) -======= ACTOR Future dataDistribution(Reference self) ->>>>>>> Remove lastLimited from data distribution { state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); - state DatabaseConfiguration configuration = self->configuration->get(); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; -<<<<<<< HEAD -======= - state Transaction tr(cx); - loop { - try { - tr.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS ); - tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE ); - - Standalone replicaKeys = wait(tr.getRange(datacenterReplicasKeys, CLIENT_KNOBS->TOO_MANY)); - - for(auto& kv : replicaKeys) { - auto dcId = decodeDatacenterReplicasKey(kv.key); - auto replicas = decodeDatacenterReplicasValue(kv.value); - if ((self->primaryDcId.size() && self->primaryDcId[0] == dcId) || - (self->remoteDcIds.size() && self->remoteDcIds[0] == dcId && configuration.usableRegions > 1)) { - if(replicas > configuration.storageTeamSize) { - tr.set(kv.key, datacenterReplicasValue(configuration.storageTeamSize)); - } - } else { - tr.clear(kv.key); - } - } - - wait(tr.commit()); - break; - } - catch(Error &e) { - wait(tr.onError(e)); - } - } - - ->>>>>>> Minor fix on ratekeeper work registration. 
//cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) ); //ASSERT( cx->locationCacheSize == SERVER_KNOBS->DD_LOCATION_CACHE_SIZE ); @@ -3538,8 +3475,7 @@ ACTOR Future dataDistribution(Reference self) TraceEvent("DDInitTakingMoveKeysLock", self->ddId); MoveKeysLock lock_ = wait( takeMoveKeysLock( cx, self->ddId ) ); lock = lock_; -<<<<<<< HEAD - TraceEvent("DDInitTookMoveKeysLock", myId); + TraceEvent("DDInitTookMoveKeysLock", self->ddId); DatabaseConfiguration configuration_ = wait( getDatabaseConfiguration(cx) ); configuration = configuration_; @@ -3553,7 +3489,7 @@ ACTOR Future dataDistribution(Reference self) remoteDcIds.push_back( regions[1].dcId ); } - TraceEvent("DDInitGotConfiguration", myId).detail("Conf", configuration.toString()); + TraceEvent("DDInitGotConfiguration", self->ddId).detail("Conf", configuration.toString()); state Transaction tr(cx); loop { @@ -3583,12 +3519,8 @@ ACTOR Future dataDistribution(Reference self) } } - TraceEvent("DDInitUpdatedReplicaKeys", myId); - Reference initData_ = wait( getInitialDataDistribution(cx, myId, lock, configuration.usableRegions > 1 ? remoteDcIds : std::vector>() ) ); -======= - TraceEvent("DDInitTookMoveKeysLock", self->ddId); - Reference initData_ = wait( getInitialDataDistribution(cx, self->ddId, lock, configuration.usableRegions > 1 ? self->remoteDcIds : std::vector>() ) ); ->>>>>>> Fix merge conflicts during rebase. + TraceEvent("DDInitUpdatedReplicaKeys", self->ddId); + Reference initData_ = wait( getInitialDataDistribution(cx, self->ddId, lock, configuration.usableRegions > 1 ? remoteDcIds : std::vector>() ) ); initData = initData_; if(initData->shards.size() > 1) { TraceEvent("DDInitGotInitialDD", self->ddId) @@ -3683,10 +3615,10 @@ ACTOR Future dataDistribution(Reference self) actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector teamCollectionsPtrs; - Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->primaryDcId, configuration.usableRegions > 1 ? self->remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); + Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? 
remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); teamCollectionsPtrs.push_back(primaryTeamCollection.getPtr()); if (configuration.usableRegions > 1) { - Reference remoteTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->remoteDcIds, Optional>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo), zeroHealthyTeams[1], false, processingUnhealthy) ); + Reference remoteTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, remoteDcIds, Optional>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo), zeroHealthyTeams[1], false, processingUnhealthy) ); teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr()); remoteTeamCollection->teamCollections = teamCollectionsPtrs; actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], self->dbInfo ), "DDTeamCollectionSecondary", self->ddId, &normalDDQueueErrors() ) ); @@ -3710,14 +3642,6 @@ ACTOR Future dataDistribution(Reference self) } } -struct DataDistributorData : NonCopyable, ReferenceCounted { - Reference> dbInfo; - UID ddId; - PromiseStream> addActor; - - DataDistributorData(Reference> const& db, UID id) : dbInfo(db), ddId(id) {} -}; - static std::set const& normalDataDistributorErrors() { static std::set s; if (s.empty()) { @@ -3734,33 +3658,12 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference self( new DataDistributorData(db, di.id()) ); state Future collection = actorCollection( self->addActor.getFuture() ); - TraceEvent("DataDistributor_Starting", di.id()); - self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) ); - try { TraceEvent("DataDistributor_Running", di.id()); -<<<<<<< HEAD - state double lastLimited = 0; - state Future distributor = reportErrorsExcept( dataDistribution( self->dbInfo, &lastLimited ), "DataDistribution", di.id(), &normalDataDistributorErrors() ); - - wait( distributor || collection ); -======= + self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) ); state Future distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); - loop choose { - when ( wait( self->configuration->onChange() ) ) { - TraceEvent("DataDistributor_Restart", di.id()) - .detail("Configuration", self->configuration->get().toString()); - self->refreshDcIds(); - distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); - } - when ( wait( collection ) ) { - ASSERT(false); - throw internal_error(); - } - when ( wait( distributor ) ) {} - } ->>>>>>> Remove lastLimited from data distribution + wait( distributor || collection ); } catch ( Error &err ) { if ( normalDataDistributorErrors().count(err.code()) == 0 ) { From dc129207a92098126260c627c57704d02b1d6a3a Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 27 Feb 2019 11:51:48 -0800 Subject: [PATCH 10/46] Minor fix after rebase. 
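One pattern worth noting in the getRate() cleanup below: the proxy never polls
a ratekeeper that has not registered. The request timer idles at Never() until
ServerDBInfo carries a ratekeeper interface. A condensed sketch of that
poll-when-present pattern (actor name and the re-arm interval are
illustrative, not the full proxy code):

    ACTOR Future<Void> pollRatekeeperSketch(Reference<AsyncVar<ServerDBInfo>> db) {
        state Future<Void> nextRequestTimer = Never();
        if (db->get().ratekeeper.present()) nextRequestTimer = Void();
        loop choose {
            when ( wait( db->onChange() ) ) {
                // arm the timer when a ratekeeper (re)registers, disarm if it vanishes
                if (db->get().ratekeeper.present())
                    nextRequestTimer = Void();
                else
                    nextRequestTimer = Never();
            }
            when ( wait( nextRequestTimer ) ) {
                // send a GetRateInfoRequest to db->get().ratekeeper.get() here,
                // then re-arm for the next poll
                nextRequestTimer = delay( SERVER_KNOBS->METRIC_UPDATE_RATE );
            }
        }
    }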
---
 fdbserver/DataDistributorInterface.h  |  1 -
 fdbserver/MasterProxyServer.actor.cpp | 12 ------------
 fdbserver/Ratekeeper.actor.cpp        |  7 +------
 fdbserver/RatekeeperInterface.h       |  1 -
 4 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/fdbserver/DataDistributorInterface.h b/fdbserver/DataDistributorInterface.h
index d437fc69ae..4c2f68f83d 100644
--- a/fdbserver/DataDistributorInterface.h
+++ b/fdbserver/DataDistributorInterface.h
@@ -21,7 +21,6 @@
 #ifndef FDBSERVER_DATADISTRIBUTORINTERFACE_H
 #define FDBSERVER_DATADISTRIBUTORINTERFACE_H

-#include "fdbclient/FDBTypes.h"
 #include "fdbrpc/fdbrpc.h"
 #include "fdbrpc/Locality.h"

diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp
index 2b3299572e..4cbe878eb6 100644
--- a/fdbserver/MasterProxyServer.actor.cpp
+++ b/fdbserver/MasterProxyServer.actor.cpp
@@ -76,17 +76,6 @@ struct ProxyStats {
 	}
 };

-ACTOR template <class T>
-Future<Void> forwardValue(Promise<T> out, Future<T> in)
-{
-	// Like forwardPromise, but throws on error
-	T t = wait(in);
-	out.send(t);
-	return Void();
-}
-
-int getBytes(Promise<Void> const& r) { return 0; }
-
 ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64_t* inTransactionCount, int64_t* inBatchTransactionCount, double* outTransactionRate, double* outBatchTransactionRate, GetHealthMetricsReply* healthMetricsReply, GetHealthMetricsReply* detailedHealthMetricsReply) {
 	state Future<Void> nextRequestTimer = Never();
@@ -94,7 +83,6 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64_t* inTransactionCount,
 	state Future<GetRateInfoReply> reply = Never();
 	state double lastDetailedReply = 0.0; // request detailed metrics immediately
 	state bool expectingDetailedReply = false;
-	state int64_t lastTC = 0;

 	if (db->get().ratekeeper.present()) nextRequestTimer = Void();
diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp
index ec6150a87e..00ea50e850 100644
--- a/fdbserver/Ratekeeper.actor.cpp
+++ b/fdbserver/Ratekeeper.actor.cpp
@@ -24,7 +24,7 @@
 #include "fdbrpc/simulator.h"
 #include "fdbclient/ReadYourWrites.h"
 #include "fdbserver/Knobs.h"
-#include "fdbserver/DataDistribution.h"
+#include "fdbserver/DataDistribution.actor.h"
 #include "fdbserver/ServerDBInfo.h"
 #include "fdbserver/WaitFailure.h"
 #include "flow/actorcompiler.h"  // This must be the last #include.
@@ -676,11 +676,6 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference SERVER_KNOBS->LAST_LIMITED_RATIO * self.batchLimits.tpsLimit) { - *self.lastLimited = now(); - } - - double tooOld = now() - 1.0; for(auto p=self.proxy_transactionCounts.begin(); p!=self.proxy_transactionCounts.end(); ) { if (p->second.time < tooOld) diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h index 36a47e167a..c50447d544 100644 --- a/fdbserver/RatekeeperInterface.h +++ b/fdbserver/RatekeeperInterface.h @@ -21,7 +21,6 @@ #ifndef FDBSERVER_RATEKEEPERINTERFACE_H #define FDBSERVER_RATEKEEPERINTERFACE_H -#include "fdbclient/StorageServerInterface.h" #include "fdbclient/FDBTypes.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" From f43277e8192ee71c62a0492f55e6002d96b6106b Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 6 Mar 2019 10:46:17 -0800 Subject: [PATCH 11/46] Format Ratekeeper.actor.cpp code --- fdbserver/Ratekeeper.actor.cpp | 82 +++++++++++++++++----------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 00ea50e850..a0ff3ffd5e 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -310,12 +310,12 @@ ACTOR Future monitorServerListChange( serverListAndProcessClasses = Never(); std::map newServers; - for( int i = 0; i < results.size(); i++ ) { - UID serverId = results[i].first.id(); - StorageServerInterface const& ssi = results[i].first; + for (int i = 0; i < results.size(); i++) { + const StorageServerInterface& ssi = results[i].first; + const UID serverId = ssi.id(); newServers[serverId] = ssi; - if ( oldServers.count( serverId ) ) { + if (oldServers.count(serverId)) { if (ssi.getValue.getEndpoint() != oldServers[serverId].getValue.getEndpoint()) { serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); } @@ -325,7 +325,7 @@ ACTOR Future monitorServerListChange( } } - for (auto it : oldServers) { + for (const auto& it : oldServers) { serverChanges.send( std::make_pair(it.first, Optional()) ); } @@ -342,7 +342,7 @@ ACTOR Future monitorServerListChange( } } -void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { +void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { //double controlFactor = ; // dt / eFoldingTime double actualTps = self->smoothReleasedTransactions.smoothRate(); @@ -350,7 +350,7 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { // SOMEDAY: Remove the max( 1.0, ... 
) since the below calculations _should_ be able to recover back up from this value actualTps = std::max( std::max( 1.0, actualTps ), self->smoothTotalDurableBytes.smoothRate() / CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT ); - limits.tpsLimit = std::numeric_limits::infinity(); + limits->tpsLimit = std::numeric_limits::infinity(); UID reasonID = UID(); limitReason_t limitReason = limitReason_t::unlimited; @@ -376,9 +376,9 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { worstFreeSpaceStorageServer = std::min(worstFreeSpaceStorageServer, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace); - int64_t springBytes = std::max(1, std::min(limits.storageSpringBytes, (ss.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2)); - int64_t targetBytes = std::max(1, std::min(limits.storageTargetBytes, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace)); - if (targetBytes != limits.storageTargetBytes) { + int64_t springBytes = std::max(1, std::min(limits->storageSpringBytes, (ss.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2)); + int64_t targetBytes = std::max(1, std::min(limits->storageTargetBytes, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace)); + if (targetBytes != limits->storageTargetBytes) { if (minFreeSpace == SERVER_KNOBS->MIN_FREE_SPACE) { ssLimitReason = limitReason_t::storage_server_min_free_space; } else { @@ -442,9 +442,9 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { storageTpsLimitReverseIndex.insert(std::make_pair(limitTps, &ss)); - if(limitTps < limits.tpsLimit && (ssLimitReason == limitReason_t::storage_server_min_free_space || ssLimitReason == limitReason_t::storage_server_min_free_space_ratio)) { + if (limitTps < limits->tpsLimit && (ssLimitReason == limitReason_t::storage_server_min_free_space || ssLimitReason == limitReason_t::storage_server_min_free_space_ratio)) { reasonID = ss.id; - limits.tpsLimit = limitTps; + limits->tpsLimit = limitTps; limitReason = ssLimitReason; } @@ -455,19 +455,19 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { self->healthMetrics.worstStorageDurabilityLag = worstStorageDurabilityLagStorageServer; std::set>> ignoredMachines; - for(auto ss = storageTpsLimitReverseIndex.begin(); ss != storageTpsLimitReverseIndex.end() && ss->first < limits.tpsLimit; ++ss) { - if(ignoredMachines.size() < std::min(self->configuration.storageTeamSize - 1, SERVER_KNOBS->MAX_MACHINES_FALLING_BEHIND)) { + for (auto ss = storageTpsLimitReverseIndex.begin(); ss != storageTpsLimitReverseIndex.end() && ss->first < limits->tpsLimit; ++ss) { + if (ignoredMachines.size() < std::min(self->configuration.storageTeamSize - 1, SERVER_KNOBS->MAX_MACHINES_FALLING_BEHIND)) { ignoredMachines.insert(ss->second->locality.zoneId()); continue; } - if(ignoredMachines.count(ss->second->locality.zoneId()) > 0) { + if (ignoredMachines.count(ss->second->locality.zoneId()) > 0) { continue; } limitingStorageQueueStorageServer = ss->second->lastReply.bytesInput - ss->second->smoothDurableBytes.smoothTotal(); - limits.tpsLimit = ss->first; - limitReason = ssReasons[storageTpsLimitReverseIndex.begin()->second->id]; + limits->tpsLimit = ss->first; reasonID = storageTpsLimitReverseIndex.begin()->second->id; // Although we aren't controlling based on the worst SS, we still report it as the limiting process + limitReason = ssReasons[reasonID]; break; } @@ -479,27 +479,27 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { { Version minSSVer = std::numeric_limits::max(); Version minLimitingSSVer = 
std::numeric_limits::max(); - for(auto i = self->storageQueueInfo.begin(); i != self->storageQueueInfo.end(); ++i) { - auto& ss = i->value; + for (const auto& it : self->storageQueueInfo) { + auto& ss = it.value; if (!ss.valid) continue; minSSVer = std::min(minSSVer, ss.lastReply.version); // Machines that ratekeeper isn't controlling can fall arbitrarily far behind - if(ignoredMachines.count(i->value.locality.zoneId()) == 0) { + if (ignoredMachines.count(it.value.locality.zoneId()) == 0) { minLimitingSSVer = std::min(minLimitingSSVer, ss.lastReply.version); } } Version maxTLVer = std::numeric_limits::min(); - for(auto i = self->tlogQueueInfo.begin(); i != self->tlogQueueInfo.end(); ++i) { - auto& tl = i->value; + for(const auto& it : self->tlogQueueInfo) { + auto& tl = it.value; if (!tl.valid) continue; maxTLVer = std::max(maxTLVer, tl.lastReply.v); } // writeToReadLatencyLimit: 0 = infinte speed; 1 = TL durable speed ; 2 = half TL durable speed - writeToReadLatencyLimit = ((maxTLVer - minLimitingSSVer) - limits.maxVersionDifference/2) / (limits.maxVersionDifference/4); + writeToReadLatencyLimit = ((maxTLVer - minLimitingSSVer) - limits->maxVersionDifference/2) / (limits->maxVersionDifference/4); worstVersionLag = std::max((Version)0, maxTLVer - minSSVer); limitingVersionLag = std::max((Version)0, maxTLVer - minLimitingSSVer); } @@ -507,8 +507,8 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { int64_t worstFreeSpaceTLog = std::numeric_limits::max(); int64_t worstStorageQueueTLog = 0; int tlcount = 0; - for(auto i = self->tlogQueueInfo.begin(); i != self->tlogQueueInfo.end(); ++i) { - auto& tl = i->value; + for (auto& it : self->tlogQueueInfo) { + auto& tl = it.value; if (!tl.valid) continue; ++tlcount; @@ -518,9 +518,9 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { worstFreeSpaceTLog = std::min(worstFreeSpaceTLog, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace); - int64_t springBytes = std::max(1, std::min(limits.logSpringBytes, (tl.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2)); - int64_t targetBytes = std::max(1, std::min(limits.logTargetBytes, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace)); - if (targetBytes != limits.logTargetBytes) { + int64_t springBytes = std::max(1, std::min(limits->logSpringBytes, (tl.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2)); + int64_t targetBytes = std::max(1, std::min(limits->logTargetBytes, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace)); + if (targetBytes != limits->logTargetBytes) { if (minFreeSpace == SERVER_KNOBS->MIN_FREE_SPACE) { tlogLimitReason = limitReason_t::log_server_min_free_space; } else { @@ -540,7 +540,7 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { } reasonID = tl.id; limitReason = limitReason_t::log_server_min_free_space; - limits.tpsLimit = 0.0; + limits->tpsLimit = 0.0; } double targetRateRatio = std::min( ( b + springBytes ) / (double)springBytes, 2.0 ); @@ -558,8 +558,8 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { if (targetRateRatio < .75) //< FIXME: KNOB for 2.0 x = std::max(x, 0.95); double lim = actualTps * x; - if (lim < limits.tpsLimit){ - limits.tpsLimit = lim; + if (lim < limits->tpsLimit){ + limits->tpsLimit = lim; reasonID = tl.id; limitReason = tlogLimitReason; } @@ -568,8 +568,8 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { // Don't let any tlogs use up its target bytes faster than its MVCC window! 
double x = ((targetBytes - springBytes) / ((((double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS)/SERVER_KNOBS->VERSIONS_PER_SECOND) + 2.0)) / inputRate; double lim = actualTps * x; - if (lim < limits.tpsLimit){ - limits.tpsLimit = lim; + if (lim < limits->tpsLimit){ + limits->tpsLimit = lim; reasonID = tl.id; limitReason = limitReason_t::log_server_mvcc_write_bandwidth; } @@ -578,10 +578,10 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { self->healthMetrics.worstTLogQueue = worstStorageQueueTLog; - limits.tpsLimit = std::max(limits.tpsLimit, 0.0); + limits->tpsLimit = std::max(limits->tpsLimit, 0.0); if(g_network->isSimulated() && g_simulator.speedUpSimulation) { - limits.tpsLimit = std::max(limits.tpsLimit, 100.0); + limits->tpsLimit = std::max(limits->tpsLimit, 100.0); } int64_t totalDiskUsageBytes = 0; @@ -592,13 +592,13 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { if (s.value.valid) totalDiskUsageBytes += s.value.lastReply.storageBytes.used; - limits.tpsLimitMetric = std::min(limits.tpsLimit, 1e6); - limits.reasonMetric = limitReason; + limits->tpsLimitMetric = std::min(limits->tpsLimit, 1e6); + limits->reasonMetric = limitReason; if (g_random->random01() < 0.1) { - std::string name = "RkUpdate" + limits.context; + std::string name = "RkUpdate" + limits->context; TraceEvent(name.c_str()) - .detail("TPSLimit", limits.tpsLimit) + .detail("TPSLimit", limits->tpsLimit) .detail("Reason", limitReason) .detail("ReasonServerID", reasonID) .detail("ReleasedTPS", self->smoothReleasedTransactions.smoothRate()) @@ -673,8 +673,8 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference Date: Thu, 7 Mar 2019 10:15:28 -0800 Subject: [PATCH 12/46] Data distributor pulls batch limited info from proxy Add a flag in HealthMetrics to indicate that batch priority is rate limited. Data distributor pulls this flag from proxy to know roughly when rate limiting happens. DD uses this information to determine when to do the rebalance in the background, i.e., moving data from heavily loaded servers to lighter ones. If the cluster is currently rate limited for batch commits, then the rebalance will use longer time intervals, otherwise use shorter intervals. See BgDDMountainChopper() and BgDDValleyFiller() in DataDistributionQueue.actor.cpp. 
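A hedged sketch of that interval decision (the helper name, the 60-second
recency window, and the interval parameters are illustrative, not the actual
knobs or mover code):

    // Back off background rebalancing while batch-priority traffic has
    // recently been rate limited; move data more aggressively otherwise.
    double rebalancePollInterval(double lastLimited, double longInterval, double shortInterval) {
        bool recentlyLimited = now() - lastLimited < 60.0; // assumed window
        return recentlyLimited ? longInterval : shortInterval;
    }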
---
 fdbclient/FDBTypes.h                      |  8 +++++--
 fdbserver/DataDistribution.actor.cpp      | 28 ++++++++++++++++++++---
 fdbserver/DataDistribution.actor.h        |  3 ++-
 fdbserver/DataDistributionQueue.actor.cpp |  6 ++---
 fdbserver/Ratekeeper.actor.cpp            |  3 +++
 5 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h
index 5b7de5e818..766d39831c 100644
--- a/fdbclient/FDBTypes.h
+++ b/fdbclient/FDBTypes.h
@@ -737,6 +737,7 @@ struct HealthMetrics {
 	int64_t worstStorageDurabilityLag;
 	int64_t worstTLogQueue;
 	double tpsLimit;
+	bool batchLimited;
 	std::map<UID, StorageStats> storageStats;
 	std::map<UID, int64_t> tLogQueue;

@@ -745,6 +746,7 @@ struct HealthMetrics {
 		, worstStorageDurabilityLag(0)
 		, worstTLogQueue(0)
 		, tpsLimit(0.0)
+		, batchLimited(false)
 	{}

 	void update(const HealthMetrics& hm, bool detailedInput, bool detailedOutput)
@@ -753,6 +755,7 @@ struct HealthMetrics {
 		worstStorageDurabilityLag = hm.worstStorageDurabilityLag;
 		worstTLogQueue = hm.worstTLogQueue;
 		tpsLimit = hm.tpsLimit;
+		batchLimited = hm.batchLimited;

 		if (!detailedOutput) {
 			storageStats.clear();
@@ -769,13 +772,14 @@ struct HealthMetrics {
 			worstStorageDurabilityLag == r.worstStorageDurabilityLag &&
 			worstTLogQueue == r.worstTLogQueue &&
 			storageStats == r.storageStats &&
-			tLogQueue == r.tLogQueue
+			tLogQueue == r.tLogQueue &&
+			batchLimited == r.batchLimited
 		);
 	}

 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, worstStorageQueue, worstStorageDurabilityLag, worstTLogQueue, tpsLimit, storageStats, tLogQueue);
+		serializer(ar, worstStorageQueue, worstStorageDurabilityLag, worstTLogQueue, tpsLimit, batchLimited, storageStats, tLogQueue);
 	}
 };

diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
index d664e07249..e6af98ce49 100644
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ -3455,7 +3455,7 @@ struct DataDistributorData : NonCopyable, ReferenceCounted<DataDistributorData>
 	DataDistributorData(Reference<AsyncVar<ServerDBInfo>> const& db, UID id) : dbInfo(db), ddId(id) {}
 };

-ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
+ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self, double* lastLimited)
 {
 	state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true);
 	cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE;
@@ -3612,7 +3612,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self, double* lastLimited)
 	actors.push_back( pollMoveKeysLock(cx, lock) );
 	actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) );
-	actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize ), "DDQueue", self->ddId, &normalDDQueueErrors() ) );
+	actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) );

 	vector<DDTeamCollection*> teamCollectionsPtrs;
 	Reference<DDTeamCollection> primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ?
remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); @@ -3654,14 +3654,36 @@ static std::set const& normalDataDistributorErrors() { return s; } +ACTOR Future monitorBatchLimitedTime(Reference> db, double* lastLimited) { + loop { + wait( delay(SERVER_KNOBS->METRIC_UPDATE_RATE) ); + while (db->get().client.proxies.size() == 0) { + wait(db->onChange()); + } + + state int idx = g_random->randomInt(0, db->get().client.proxies.size()); + choose { + when (wait(db->onChange())) {} + when (ErrorOr reply = wait( + db->get().client.proxies[idx].getHealthMetrics.getReplyUnlessFailedFor(GetHealthMetricsRequest(false), 1.0, 0))) { + if (reply.present() && reply.get().healthMetrics.batchLimited) { + *lastLimited = now(); + } + } + } + } +} + ACTOR Future dataDistributor(DataDistributorInterface di, Reference> db ) { state Reference self( new DataDistributorData(db, di.id()) ); state Future collection = actorCollection( self->addActor.getFuture() ); + state double lastLimited = 0; try { TraceEvent("DataDistributor_Running", di.id()); self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) ); - state Future distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); + self->addActor.send( monitorBatchLimitedTime(db, &lastLimited) ); + state Future distributor = reportErrorsExcept( dataDistribution(self, &lastLimited), "DataDistribution", di.id(), &normalDataDistributorErrors() ); wait( distributor || collection ); } diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 109c57878b..1baff7f58e 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -230,7 +230,8 @@ Future dataDistributionQueue( MoveKeysLock const& lock, PromiseStream> const& getAverageShardBytes, UID const& distributorId, - int const& teamSize); + int const& teamSize, + double* const& lastLimited); //Holds the permitted size and IO Bounds for a shard struct ShardSizeBounds { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index f0b8d28d41..f282ef12bc 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1201,10 +1201,10 @@ ACTOR Future dataDistributionQueue( MoveKeysLock lock, PromiseStream> getAverageShardBytes, UID distributorId, - int teamSize) + int teamSize, + double* lastLimited) { - state double lastLimited = 0; - state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, &lastLimited ); + state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, lastLimited ); state std::set serversToLaunchFrom; state KeyRange keysToLaunchFrom; state RelocateData launchData; diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index a0ff3ffd5e..46dcfe25f0 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -671,11 +671,13 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference SERVER_KNOBS->LAST_LIMITED_RATIO * self.batchLimits.tpsLimit; double tooOld = now() - 1.0; for(auto p=self.proxy_transactionCounts.begin(); p!=self.proxy_transactionCounts.end(); ) { if (p->second.time < tooOld) @@ -707,6 +709,7 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference Date: Fri, 8 Mar 2019 
---
 fdbserver/ClusterController.actor.cpp      | 266 +++++++++---------
 fdbserver/ClusterRecruitmentInterface.h    |   2 +-
 fdbserver/QuietDatabase.actor.cpp          |  30 +-
 fdbserver/QuietDatabase.h                  |   2 +-
 fdbserver/Status.actor.cpp                 | 114 ++++----
 fdbserver/Status.h                         |   2 +-
 fdbserver/WorkerInterface.actor.h          |  14 +
 fdbserver/tester.actor.cpp                 |   6 +-
 .../workloads/ConsistencyCheck.actor.cpp   |  87 +++---
 fdbserver/workloads/CpuProfiler.actor.cpp  |   6 +-
 fdbserver/workloads/LogMetrics.actor.cpp   |   4 +-
 fdbserver/workloads/Performance.actor.cpp  |   6 +-
 fdbserver/workloads/Ping.actor.cpp         |   8 +-
 fdbserver/workloads/ReadWrite.actor.cpp    |   4 +-
 fdbserver/workloads/TargetedKill.actor.cpp |  10 +-
 fdbserver/workloads/WorkerErrors.actor.cpp |   6 +-
 16 files changed, 289 insertions(+), 278 deletions(-)

diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp
index 53962c0340..938a5fc6b3 100644
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@@ -51,37 +51,35 @@ struct WorkerInfo : NonCopyable {
 	Generation gen;
 	int reboots;
 	double lastAvailableTime;
-	WorkerInterface interf;
 	ProcessClass initialClass;
-	ProcessClass processClass;
 	ClusterControllerPriorityInfo priorityInfo;
+	WorkerDetails details;
 
 	WorkerInfo() : gen(-1), reboots(0), lastAvailableTime(now()), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
-	WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo ) :
-		watcher(watcher), reply(reply), gen(gen), reboots(0), lastAvailableTime(now()), interf(interf), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo) {}
+	WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded ) :
+		watcher(watcher), reply(reply), gen(gen), reboots(0), lastAvailableTime(now()), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded) {}
 
 	WorkerInfo( WorkerInfo&& r ) noexcept(true) : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen),
-		reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), interf(std::move(r.interf)), initialClass(r.initialClass), processClass(r.processClass), priorityInfo(r.priorityInfo) {}
+		reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)) {}
 	void operator=( WorkerInfo&& r ) noexcept(true) {
 		watcher = std::move(r.watcher);
 		reply = std::move(r.reply);
 		gen = r.gen;
 		reboots = r.reboots;
 		lastAvailableTime = r.lastAvailableTime;
-		interf = std::move(r.interf);
 		initialClass = r.initialClass;
-		processClass = r.processClass;
 		priorityInfo = r.priorityInfo;
+		details = std::move(r.details);
 	}
 };
 
 struct WorkerFitnessInfo {
-	std::pair<WorkerInterface, ProcessClass> worker;
+	WorkerDetails worker;
 	ProcessClass::Fitness fitness;
 	int used;
 
 	WorkerFitnessInfo() : fitness(ProcessClass::NeverAssign), used(0) {}
-	WorkerFitnessInfo(std::pair<WorkerInterface, ProcessClass> worker, ProcessClass::Fitness fitness, int used) : worker(worker), fitness(fitness), used(used) {}
+	WorkerFitnessInfo(WorkerDetails worker, ProcessClass::Fitness fitness, int used) : worker(worker), fitness(fitness), used(used) {}
 };
 
 class ClusterControllerData {
@@ -184,35
+182,35 @@ public: }; bool workerAvailable( WorkerInfo const& worker, bool checkStable ) { - return ( now() - startTime < 2 * FLOW_KNOBS->SERVER_REQUEST_INTERVAL ) || ( IFailureMonitor::failureMonitor().getState(worker.interf.storage.getEndpoint()).isAvailable() && ( !checkStable || worker.reboots < 2 ) ); + return ( now() - startTime < 2 * FLOW_KNOBS->SERVER_REQUEST_INTERVAL ) || ( IFailureMonitor::failureMonitor().getState(worker.details.interf.storage.getEndpoint()).isAvailable() && ( !checkStable || worker.reboots < 2 ) ); } - std::pair getStorageWorker( RecruitStorageRequest const& req ) { + WorkerDetails getStorageWorker( RecruitStorageRequest const& req ) { std::set>> excludedMachines( req.excludeMachines.begin(), req.excludeMachines.end() ); std::set>> includeDCs( req.includeDCs.begin(), req.includeDCs.end() ); std::set excludedAddresses( req.excludeAddresses.begin(), req.excludeAddresses.end() ); for( auto& it : id_worker ) if( workerAvailable( it.second, false ) && - !excludedMachines.count(it.second.interf.locality.zoneId()) && - ( includeDCs.size() == 0 || includeDCs.count(it.second.interf.locality.dcId()) ) && - !addressExcluded(excludedAddresses, it.second.interf.address()) && - it.second.processClass.machineClassFitness( ProcessClass::Storage ) <= ProcessClass::UnsetFit ) { - return std::make_pair(it.second.interf, it.second.processClass); + !excludedMachines.count(it.second.details.interf.locality.zoneId()) && + ( includeDCs.size() == 0 || includeDCs.count(it.second.details.interf.locality.dcId()) ) && + !addressExcluded(excludedAddresses, it.second.details.interf.address()) && + it.second.details.processClass.machineClassFitness( ProcessClass::Storage ) <= ProcessClass::UnsetFit ) { + return it.second.details; } if( req.criticalRecruitment ) { ProcessClass::Fitness bestFit = ProcessClass::NeverAssign; - Optional> bestInfo; + Optional bestInfo; for( auto& it : id_worker ) { - ProcessClass::Fitness fit = it.second.processClass.machineClassFitness( ProcessClass::Storage ); + ProcessClass::Fitness fit = it.second.details.processClass.machineClassFitness( ProcessClass::Storage ); if( workerAvailable( it.second, false ) && - !excludedMachines.count(it.second.interf.locality.zoneId()) && - ( includeDCs.size() == 0 || includeDCs.count(it.second.interf.locality.dcId()) ) && - !addressExcluded(excludedAddresses, it.second.interf.address()) && + !excludedMachines.count(it.second.details.interf.locality.zoneId()) && + ( includeDCs.size() == 0 || includeDCs.count(it.second.details.interf.locality.dcId()) ) && + !addressExcluded(excludedAddresses, it.second.details.interf.address()) && fit < bestFit ) { bestFit = fit; - bestInfo = std::make_pair(it.second.interf, it.second.processClass); + bestInfo = it.second.details; } } @@ -224,23 +222,23 @@ public: throw no_more_servers(); } - std::vector> getWorkersForSeedServers( DatabaseConfiguration const& conf, IRepPolicyRef const& policy, Optional>> const& dcId = Optional>>() ) { - std::map>> fitness_workers; - std::vector> results; - LocalitySetRef logServerSet = Reference(new LocalityMap>()); - LocalityMap>* logServerMap = (LocalityMap>*) logServerSet.getPtr(); + std::vector getWorkersForSeedServers( DatabaseConfiguration const& conf, IRepPolicyRef const& policy, Optional>> const& dcId = Optional>>() ) { + std::map> fitness_workers; + std::vector results; + LocalitySetRef logServerSet = Reference(new LocalityMap()); + LocalityMap* logServerMap = (LocalityMap*) logServerSet.getPtr(); bool bCompleted = false; for( auto& it : id_worker ) { - 
auto fitness = it.second.processClass.machineClassFitness( ProcessClass::Storage ); - if( workerAvailable(it.second, false) && !conf.isExcludedServer(it.second.interf.address()) && fitness != ProcessClass::NeverAssign && ( !dcId.present() || it.second.interf.locality.dcId()==dcId.get() ) ) { - fitness_workers[ fitness ].push_back(std::make_pair(it.second.interf, it.second.processClass)); + auto fitness = it.second.details.processClass.machineClassFitness( ProcessClass::Storage ); + if( workerAvailable(it.second, false) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && ( !dcId.present() || it.second.details.interf.locality.dcId()==dcId.get() ) ) { + fitness_workers[ fitness ].push_back(it.second.details); } } for( auto& it : fitness_workers ) { for (auto& worker : it.second ) { - logServerMap->add(worker.first.locality, &worker); + logServerMap->add(worker.interf.locality, &worker); } std::vector bestSet; @@ -265,24 +263,24 @@ public: return results; } - std::vector> getWorkersForTlogs( DatabaseConfiguration const& conf, int32_t required, int32_t desired, IRepPolicyRef const& policy, std::map< Optional>, int>& id_used, bool checkStable = false, std::set> dcIds = std::set>() ) { - std::map>> fitness_workers; - std::vector> results; + std::vector getWorkersForTlogs( DatabaseConfiguration const& conf, int32_t required, int32_t desired, IRepPolicyRef const& policy, std::map< Optional>, int>& id_used, bool checkStable = false, std::set> dcIds = std::set>() ) { + std::map> fitness_workers; + std::vector results; std::vector unavailableLocals; LocalitySetRef logServerSet; - LocalityMap>* logServerMap; + LocalityMap* logServerMap; bool bCompleted = false; - logServerSet = Reference(new LocalityMap>()); - logServerMap = (LocalityMap>*) logServerSet.getPtr(); + logServerSet = Reference(new LocalityMap()); + logServerMap = (LocalityMap*) logServerSet.getPtr(); for( auto& it : id_worker ) { - auto fitness = it.second.processClass.machineClassFitness( ProcessClass::TLog ); - if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.interf.address()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.interf.locality.dcId())) ) { - fitness_workers[ fitness ].push_back(std::make_pair(it.second.interf, it.second.processClass)); + auto fitness = it.second.details.processClass.machineClassFitness( ProcessClass::TLog ); + if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId())) ) { + fitness_workers[ fitness ].push_back(it.second.details); } else { - unavailableLocals.push_back(it.second.interf.locality); + unavailableLocals.push_back(it.second.details.interf.locality); } } @@ -293,7 +291,7 @@ public: if (fitness_workers.find(fitnessEnum) == fitness_workers.end()) continue; for (auto& worker : fitness_workers[(ProcessClass::Fitness) fitness] ) { - logServerMap->add(worker.first.locality, &worker); + logServerMap->add(worker.interf.locality, &worker); } if (logServerSet->size() < required) { TraceEvent(SevWarn,"GWFTADTooFew", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy", policy->info()).detail("DesiredLogs", desired); @@ -320,7 +318,7 @@ public: auto object = logServerMap->getObject(entry); ASSERT(object); results.push_back(*object); - 
tLocalities.push_back(object->first.locality); + tLocalities.push_back(object->interf.locality); } TraceEvent("GWFTADBestResults", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("BestCount", bestSet.size()).detail("BestZones", ::describeZones(tLocalities)) .detail("BestDataHalls", ::describeDataHalls(tLocalities)).detail("TLogPolicy", policy->info()).detail("TotalResults", results.size()).detail("DesiredLogs", desired); @@ -335,7 +333,7 @@ public: if (!bCompleted) { std::vector tLocalities; for (auto& object : logServerMap->getObjects()) { - tLocalities.push_back(object->first.locality); + tLocalities.push_back(object->interf.locality); } TraceEvent(SevWarn, "GetTLogTeamFailed").detail("Policy", policy->info()).detail("Processes", logServerSet->size()).detail("Workers", id_worker.size()).detail("FitnessGroups", fitness_workers.size()) @@ -349,7 +347,7 @@ public: } for (auto& result : results) { - id_used[result.first.locality.processId()]++; + id_used[result.interf.locality.processId()]++; } TraceEvent("GetTLogTeamDone").detail("Completed", bCompleted).detail("Policy", policy->info()).detail("Results", results.size()).detail("Processes", logServerSet->size()).detail("Workers", id_worker.size()) @@ -362,7 +360,7 @@ public: } //FIXME: This logic will fallback unnecessarily when usable dcs > 1 because it does not check all combinations of potential satellite locations - std::vector> getWorkersForSatelliteLogs( const DatabaseConfiguration& conf, const RegionInfo& region, std::map< Optional>, int>& id_used, bool& satelliteFallback, bool checkStable = false ) { + std::vector getWorkersForSatelliteLogs( const DatabaseConfiguration& conf, const RegionInfo& region, std::map< Optional>, int>& id_used, bool& satelliteFallback, bool checkStable = false ) { int startDC = 0; loop { if(startDC > 0 && startDC >= region.satellites.size() + 1 - (satelliteFallback ? 
region.satelliteTLogUsableDcsFallback : region.satelliteTLogUsableDcs)) { @@ -399,15 +397,15 @@ public: } WorkerFitnessInfo getWorkerForRoleInDatacenter(Optional> const& dcId, ProcessClass::ClusterRole role, ProcessClass::Fitness unacceptableFitness, DatabaseConfiguration const& conf, std::map< Optional>, int>& id_used, bool checkStable = false ) { - std::map, vector>> fitness_workers; + std::map, vector> fitness_workers; for( auto& it : id_worker ) { - auto fitness = it.second.processClass.machineClassFitness( role ); - if(conf.isExcludedServer(it.second.interf.address())) { + auto fitness = it.second.details.processClass.machineClassFitness( role ); + if(conf.isExcludedServer(it.second.details.interf.address())) { fitness = std::max(fitness, ProcessClass::ExcludeFit); } - if( workerAvailable(it.second, checkStable) && fitness < unacceptableFitness && it.second.interf.locality.dcId()==dcId ) { - fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].push_back(std::make_pair(it.second.interf, it.second.processClass)); + if( workerAvailable(it.second, checkStable) && fitness < unacceptableFitness && it.second.details.interf.locality.dcId()==dcId ) { + fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].push_back(it.second.details); } } @@ -415,7 +413,7 @@ public: auto& w = it.second; g_random->randomShuffle(w); for( int i=0; i < w.size(); i++ ) { - id_used[w[i].first.locality.processId()]++; + id_used[w[i].interf.locality.processId()]++; return WorkerFitnessInfo(w[i], it.first.first, it.first.second); } } @@ -423,17 +421,17 @@ public: throw no_more_servers(); } - vector> getWorkersForRoleInDatacenter(Optional> const& dcId, ProcessClass::ClusterRole role, int amount, DatabaseConfiguration const& conf, std::map< Optional>, int>& id_used, Optional minWorker = Optional(), bool checkStable = false ) { - std::map, vector>> fitness_workers; - vector> results; + vector getWorkersForRoleInDatacenter(Optional> const& dcId, ProcessClass::ClusterRole role, int amount, DatabaseConfiguration const& conf, std::map< Optional>, int>& id_used, Optional minWorker = Optional(), bool checkStable = false ) { + std::map, vector> fitness_workers; + vector results; if (amount <= 0) return results; for( auto& it : id_worker ) { - auto fitness = it.second.processClass.machineClassFitness( role ); - if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.interf.address()) && it.second.interf.locality.dcId() == dcId && - ( !minWorker.present() || ( it.second.interf.id() != minWorker.get().worker.first.id() && ( fitness < minWorker.get().fitness || (fitness == minWorker.get().fitness && id_used[it.first] <= minWorker.get().used ) ) ) ) ) { - fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].push_back(std::make_pair(it.second.interf, it.second.processClass)); + auto fitness = it.second.details.processClass.machineClassFitness( role ); + if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && it.second.details.interf.locality.dcId() == dcId && + ( !minWorker.present() || ( it.second.details.interf.id() != minWorker.get().worker.interf.id() && ( fitness < minWorker.get().fitness || (fitness == minWorker.get().fitness && id_used[it.first] <= minWorker.get().used ) ) ) ) ) { + fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].push_back(it.second.details); } } @@ -442,7 +440,7 @@ public: g_random->randomShuffle(w); for( int i=0; i < w.size(); i++ ) { results.push_back(w[i]); - 
id_used[w[i].first.locality.processId()]++; + id_used[w[i].interf.locality.processId()]++; if( results.size() == amount ) return results; } @@ -465,11 +463,11 @@ public: RoleFitness(RoleFitness first, RoleFitness second, ProcessClass::ClusterRole role) : bestFit(std::min(first.worstFit, second.worstFit)), worstFit(std::max(first.worstFit, second.worstFit)), count(first.count + second.count), role(role) { } - RoleFitness( vector> workers, ProcessClass::ClusterRole role ) : role(role) { + RoleFitness( vector workers, ProcessClass::ClusterRole role ) : role(role) { worstFit = ProcessClass::BestFit; bestFit = ProcessClass::NeverAssign; for(auto it : workers) { - auto thisFit = it.second.machineClassFitness( role ); + auto thisFit = it.processClass.machineClassFitness( role ); worstFit = std::max(worstFit, thisFit); bestFit = std::min(bestFit, thisFit); } @@ -513,8 +511,8 @@ public: std::set>> getDatacenters( DatabaseConfiguration const& conf, bool checkStable = false ) { std::set>> result; for( auto& it : id_worker ) - if( workerAvailable( it.second, checkStable ) && !conf.isExcludedServer( it.second.interf.address() ) ) - result.insert(it.second.interf.locality.dcId()); + if( workerAvailable( it.second, checkStable ) && !conf.isExcludedServer( it.second.details.interf.address() ) ) + result.insert(it.second.details.interf.locality.dcId()); return result; } @@ -537,12 +535,12 @@ public: auto remoteLogs = getWorkersForTlogs( req.configuration, req.configuration.getRemoteTLogReplicationFactor(), req.configuration.getDesiredRemoteLogs(), req.configuration.getRemoteTLogPolicy(), id_used, false, remoteDC ); for(int i = 0; i < remoteLogs.size(); i++) { - result.remoteTLogs.push_back(remoteLogs[i].first); + result.remoteTLogs.push_back(remoteLogs[i].interf); } auto logRouters = getWorkersForRoleInDatacenter( req.dcId, ProcessClass::LogRouter, req.logRouterCount, req.configuration, id_used ); for(int i = 0; i < logRouters.size(); i++) { - result.logRouters.push_back(logRouters[i].first); + result.logRouters.push_back(logRouters[i].interf); } if(!remoteStartTime.present()) { @@ -587,20 +585,20 @@ public: if(req.recruitSeedServers) { auto primaryStorageServers = getWorkersForSeedServers( req.configuration, req.configuration.storagePolicy, dcId ); for(int i = 0; i < primaryStorageServers.size(); i++) { - result.storageServers.push_back(primaryStorageServers[i].first); + result.storageServers.push_back(primaryStorageServers[i].interf); } } auto tlogs = getWorkersForTlogs( req.configuration, req.configuration.tLogReplicationFactor, req.configuration.getDesiredLogs(), req.configuration.tLogPolicy, id_used, false, primaryDC ); for(int i = 0; i < tlogs.size(); i++) { - result.tLogs.push_back(tlogs[i].first); + result.tLogs.push_back(tlogs[i].interf); } - std::vector> satelliteLogs; + std::vector satelliteLogs; if(region.satelliteTLogReplicationFactor > 0) { satelliteLogs = getWorkersForSatelliteLogs( req.configuration, region, id_used, result.satelliteFallback ); for(int i = 0; i < satelliteLogs.size(); i++) { - result.satelliteTLogs.push_back(satelliteLogs[i].first); + result.satelliteTLogs.push_back(satelliteLogs[i].interf); } } @@ -614,13 +612,13 @@ public: resolvers.push_back(first_resolver.worker); for(int i = 0; i < resolvers.size(); i++) - result.resolvers.push_back(resolvers[i].first); + result.resolvers.push_back(resolvers[i].interf); for(int i = 0; i < proxies.size(); i++) - result.proxies.push_back(proxies[i].first); + result.proxies.push_back(proxies[i].interf); auto oldLogRouters = 
getWorkersForRoleInDatacenter( dcId, ProcessClass::LogRouter, req.maxOldLogRouters, req.configuration, id_used ); for(int i = 0; i < oldLogRouters.size(); i++) { - result.oldLogRouters.push_back(oldLogRouters[i].first); + result.oldLogRouters.push_back(oldLogRouters[i].interf); } if( now() - startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY && @@ -699,13 +697,13 @@ public: updateKnownIds(&id_used); auto tlogs = getWorkersForTlogs( req.configuration, req.configuration.tLogReplicationFactor, req.configuration.getDesiredLogs(), req.configuration.tLogPolicy, id_used ); for(int i = 0; i < tlogs.size(); i++) { - result.tLogs.push_back(tlogs[i].first); + result.tLogs.push_back(tlogs[i].interf); } if(req.recruitSeedServers) { auto primaryStorageServers = getWorkersForSeedServers( req.configuration, req.configuration.storagePolicy ); for(int i = 0; i < primaryStorageServers.size(); i++) - result.storageServers.push_back(primaryStorageServers[i].first); + result.storageServers.push_back(primaryStorageServers[i].interf); } auto datacenters = getDatacenters( req.configuration ); @@ -733,13 +731,13 @@ public: bestFitness = fitness; bestDC = dcId; for(int i = 0; i < resolvers.size(); i++) - result.resolvers.push_back(resolvers[i].first); + result.resolvers.push_back(resolvers[i].interf); for(int i = 0; i < proxies.size(); i++) - result.proxies.push_back(proxies[i].first); + result.proxies.push_back(proxies[i].interf); auto oldLogRouters = getWorkersForRoleInDatacenter( dcId, ProcessClass::LogRouter, req.maxOldLogRouters, req.configuration, used ); for(int i = 0; i < oldLogRouters.size(); i++) { - result.oldLogRouters.push_back(oldLogRouters[i].first); + result.oldLogRouters.push_back(oldLogRouters[i].interf); } break; } else { @@ -851,10 +849,10 @@ public: } // Get tlog processes - std::vector> tlogs; - std::vector> remote_tlogs; - std::vector> satellite_tlogs; - std::vector> log_routers; + std::vector tlogs; + std::vector remote_tlogs; + std::vector satellite_tlogs; + std::vector log_routers; std::set logRouterAddresses; for( auto& logSet : dbi.logSystemConfig.tLogs ) { @@ -866,12 +864,12 @@ public: return true; if(logSet.isLocal && logSet.locality == tagLocalitySatellite) { - satellite_tlogs.push_back(std::make_pair(tlogWorker->second.interf, tlogWorker->second.processClass)); + satellite_tlogs.push_back(tlogWorker->second.details); } else if(logSet.isLocal) { - tlogs.push_back(std::make_pair(tlogWorker->second.interf, tlogWorker->second.processClass)); + tlogs.push_back(tlogWorker->second.details); } else { - remote_tlogs.push_back(std::make_pair(tlogWorker->second.interf, tlogWorker->second.processClass)); + remote_tlogs.push_back(tlogWorker->second.details); } } @@ -881,37 +879,37 @@ public: return false; if ( tlogWorker->second.priorityInfo.isExcluded ) return true; - if( !logRouterAddresses.count( tlogWorker->second.interf.address() ) ) { - logRouterAddresses.insert( tlogWorker->second.interf.address() ); - log_routers.push_back(std::make_pair(tlogWorker->second.interf, tlogWorker->second.processClass)); + if( !logRouterAddresses.count( tlogWorker->second.details.interf.address() ) ) { + logRouterAddresses.insert( tlogWorker->second.details.interf.address() ); + log_routers.push_back(tlogWorker->second.details); } } } // Get proxy classes - std::vector proxyClasses; + std::vector proxyClasses; for(auto& it : dbi.client.proxies ) { auto proxyWorker = id_worker.find(it.locality.processId()); if ( proxyWorker == id_worker.end() ) return false; if ( proxyWorker->second.priorityInfo.isExcluded 
) return true; - proxyClasses.push_back(proxyWorker->second.processClass); + proxyClasses.push_back(proxyWorker->second.details); } // Get resolver classes - std::vector resolverClasses; + std::vector resolverClasses; for(auto& it : dbi.resolvers ) { auto resolverWorker = id_worker.find(it.locality.processId()); if ( resolverWorker == id_worker.end() ) return false; if ( resolverWorker->second.priorityInfo.isExcluded ) return true; - resolverClasses.push_back(resolverWorker->second.processClass); + resolverClasses.push_back(resolverWorker->second.details); } // Check master fitness. Don't return false if master is excluded in case all the processes are excluded, we still need master for recovery. - ProcessClass::Fitness oldMasterFit = masterWorker->second.processClass.machineClassFitness( ProcessClass::Master ); + ProcessClass::Fitness oldMasterFit = masterWorker->second.details.processClass.machineClassFitness( ProcessClass::Master ); if(db.config.isExcludedServer(dbi.master.address())) { oldMasterFit = std::max(oldMasterFit, ProcessClass::ExcludeFit); } @@ -925,7 +923,7 @@ public: if ( oldMasterFit < mworker.fitness ) return false; - if ( oldMasterFit > mworker.fitness || ( dbi.master.locality.processId() == clusterControllerProcessId && mworker.worker.first.locality.processId() != clusterControllerProcessId ) ) + if ( oldMasterFit > mworker.fitness || ( dbi.master.locality.processId() == clusterControllerProcessId && mworker.worker.interf.locality.processId() != clusterControllerProcessId ) ) return true; std::set> primaryDC; @@ -1107,9 +1105,9 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster id_used[cluster->db.serverInfo->get().distributor.get().locality.processId()]++; } state WorkerFitnessInfo masterWorker = cluster->getWorkerForRoleInDatacenter(cluster->clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db->config, id_used); - if( ( masterWorker.worker.second.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS || masterWorker.worker.first.locality.processId() == cluster->clusterControllerProcessId ) + if( ( masterWorker.worker.processClass.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS || masterWorker.worker.interf.locality.processId() == cluster->clusterControllerProcessId ) && now() - cluster->startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY ) { - TraceEvent("CCWDB", cluster->id).detail("Fitness", masterWorker.worker.second.machineClassFitness( ProcessClass::Master )); + TraceEvent("CCWDB", cluster->id).detail("Fitness", masterWorker.worker.processClass.machineClassFitness( ProcessClass::Master )); wait( delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY) ); continue; } @@ -1117,9 +1115,9 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster rmq.lifetime = db->serverInfo->get().masterLifetime; rmq.forceRecovery = db->forceRecovery; - cluster->masterProcessId = masterWorker.worker.first.locality.processId(); + cluster->masterProcessId = masterWorker.worker.interf.locality.processId(); cluster->db.unfinishedRecoveries++; - state Future> fNewMaster = masterWorker.worker.first.master.tryGetReply( rmq ); + state Future> fNewMaster = masterWorker.worker.interf.master.tryGetReply( rmq ); wait( ready(fNewMaster) || db->forceMasterFailure.onTrigger() ); if (fNewMaster.isReady() && fNewMaster.get().present()) { TraceEvent("CCWDB", cluster->id).detail("Recruited", fNewMaster.get().get().id()); @@ -1296,8 +1294,8 @@ void 
checkOutstandingStorageRequests( ClusterControllerData* self ) { auto worker = self->getStorageWorker(req.first); RecruitStorageReply rep; - rep.worker = worker.first; - rep.processClass = worker.second; + rep.worker = worker.interf; + rep.processClass = worker.processClass; req.first.reply.send( rep ); swapAndPop( &self->outstandingStorageRequests, i-- ); } @@ -1379,7 +1377,7 @@ ACTOR Future workerAvailabilityWatch( WorkerInterface worker, ProcessClass when( wait( failed ) ) { // remove workers that have failed WorkerInfo& failedWorkerInfo = cluster->id_worker[ worker.locality.processId() ]; if (!failedWorkerInfo.reply.isSet()) { - failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.processClass, failedWorkerInfo.priorityInfo) ); + failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo) ); } cluster->id_worker.erase( worker.locality.processId() ); cluster->updateWorkerList.set( worker.locality.processId(), Optional() ); @@ -1544,8 +1542,8 @@ void clusterRecruitStorage( ClusterControllerData* self, RecruitStorageRequest r throw no_more_servers(); auto worker = self->getStorageWorker(req); RecruitStorageReply rep; - rep.worker = worker.first; - rep.processClass = worker.second; + rep.worker = worker.interf; + rep.processClass = worker.processClass; req.reply.send( rep ); } catch ( Error& e ) { if (e.code() == error_code_no_more_servers) { @@ -1636,11 +1634,11 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c self->gotFullyRecoveredConfig = true; db->fullyRecoveredConfig = req.configuration.get(); for ( auto& it : self->id_worker ) { - bool isExcludedFromConfig = db->fullyRecoveredConfig.isExcludedServer(it.second.interf.address()); + bool isExcludedFromConfig = db->fullyRecoveredConfig.isExcludedServer(it.second.details.interf.address()); if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) { it.second.priorityInfo.isExcluded = isExcludedFromConfig; if( !it.second.reply.isSet() ) { - it.second.reply.send( RegisterWorkerReply( it.second.processClass, it.second.priorityInfo ) ); + it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) ); } } } @@ -1727,7 +1725,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { } // Check process class and exclusive property - if ( info == self->id_worker.end() || info->second.interf.id() != w.id() || req.generation >= info->second.gen ) { + if ( info == self->id_worker.end() || info->second.details.interf.id() != w.id() || req.generation >= info->second.gen ) { if ( self->gotProcessClasses ) { auto classIter = self->id_class.find(w.locality.processId()); @@ -1755,23 +1753,23 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { self->db.setDistributor( di ); } if( info == self->id_worker.end() ) { - self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo ); + self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, false ); checkOutstandingRequests( self ); return; } - if( info->second.interf.id() != w.id() || req.generation >= info->second.gen ) { + if( info->second.details.interf.id() != w.id() || req.generation >= info->second.gen ) { if (!info->second.reply.isSet()) { 
info->second.reply.send( Never() ); } info->second.reply = req.reply; - info->second.processClass = newProcessClass; + info->second.details.processClass = newProcessClass; info->second.priorityInfo = newPriorityInfo; info->second.initialClass = req.initialClass; info->second.gen = req.generation; - if(info->second.interf.id() != w.id()) { - info->second.interf = w; + if(info->second.details.interf.id() != w.id()) { + info->second.details.interf = w; info->second.watcher = workerAvailabilityWatch( w, newProcessClass, self ); } checkOutstandingRequests( self ); @@ -1893,9 +1891,9 @@ ACTOR Future statusServer(FutureStream< StatusRequest> requests, } // Get status but trap errors to send back to client. - vector> workers; + vector workers; for(auto& it : self->id_worker) - workers.push_back(std::make_pair(it.second.interf, it.second.processClass)); + workers.push_back(it.second.details); std::vector incompatibleConnections; for(auto it = self->db.incompatibleConnections.begin(); it != self->db.incompatibleConnections.end();) { @@ -1992,11 +1990,11 @@ ACTOR Future monitorProcessClasses(ClusterControllerData *self) { } - if (newProcessClass != w.second.processClass) { - w.second.processClass = newProcessClass; + if (newProcessClass != w.second.details.processClass) { + w.second.details.processClass = newProcessClass; w.second.priorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController); if (!w.second.reply.isSet()) { - w.second.reply.send( RegisterWorkerReply(w.second.processClass, w.second.priorityInfo) ); + w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo) ); } } } @@ -2098,14 +2096,14 @@ ACTOR Future updatedChangingDatacenters(ClusterControllerData *self) { self->changingDcIds.set(std::make_pair(false,self->desiredDcIds.get())); } else { auto& worker = self->id_worker[self->clusterControllerProcessId]; - uint8_t newFitness = ClusterControllerPriorityInfo::calculateDCFitness( worker.interf.locality.dcId(), self->desiredDcIds.get().get() ); + uint8_t newFitness = ClusterControllerPriorityInfo::calculateDCFitness( worker.details.interf.locality.dcId(), self->desiredDcIds.get().get() ); self->changingDcIds.set(std::make_pair(worker.priorityInfo.dcFitness > newFitness,self->desiredDcIds.get())); TraceEvent("UpdateChangingDatacenter", self->id).detail("OldFitness", worker.priorityInfo.dcFitness).detail("NewFitness", newFitness); if ( worker.priorityInfo.dcFitness > newFitness ) { worker.priorityInfo.dcFitness = newFitness; if(!worker.reply.isSet()) { - worker.reply.send( RegisterWorkerReply( worker.processClass, worker.priorityInfo ) ); + worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) ); } } else { state int currentFit = ProcessClass::BestFit; @@ -2113,12 +2111,12 @@ ACTOR Future updatedChangingDatacenters(ClusterControllerData *self) { bool updated = false; for ( auto& it : self->id_worker ) { if( ( !it.second.priorityInfo.isExcluded && it.second.priorityInfo.processClassFitness == currentFit ) || currentFit == ProcessClass::NeverAssign ) { - uint8_t fitness = ClusterControllerPriorityInfo::calculateDCFitness( it.second.interf.locality.dcId(), self->changingDcIds.get().second.get() ); + uint8_t fitness = ClusterControllerPriorityInfo::calculateDCFitness( it.second.details.interf.locality.dcId(), self->changingDcIds.get().second.get() ); if ( it.first != self->clusterControllerProcessId && it.second.priorityInfo.dcFitness != fitness ) { updated = true; 
it.second.priorityInfo.dcFitness = fitness; if(!it.second.reply.isSet()) { - it.second.reply.send( RegisterWorkerReply( it.second.processClass, it.second.priorityInfo ) ); + it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) ); } } } @@ -2153,11 +2151,11 @@ ACTOR Future updatedChangedDatacenters(ClusterControllerData *self) { TraceEvent("UpdateChangedDatacenter", self->id).detail("CCFirst", self->changedDcIds.get().first); if( !self->changedDcIds.get().first ) { auto& worker = self->id_worker[self->clusterControllerProcessId]; - uint8_t newFitness = ClusterControllerPriorityInfo::calculateDCFitness( worker.interf.locality.dcId(), self->changedDcIds.get().second.get() ); + uint8_t newFitness = ClusterControllerPriorityInfo::calculateDCFitness( worker.details.interf.locality.dcId(), self->changedDcIds.get().second.get() ); if( worker.priorityInfo.dcFitness != newFitness ) { worker.priorityInfo.dcFitness = newFitness; if(!worker.reply.isSet()) { - worker.reply.send( RegisterWorkerReply( worker.processClass, worker.priorityInfo ) ); + worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) ); } } } else { @@ -2166,12 +2164,12 @@ ACTOR Future updatedChangedDatacenters(ClusterControllerData *self) { bool updated = false; for ( auto& it : self->id_worker ) { if( ( !it.second.priorityInfo.isExcluded && it.second.priorityInfo.processClassFitness == currentFit ) || currentFit == ProcessClass::NeverAssign ) { - uint8_t fitness = ClusterControllerPriorityInfo::calculateDCFitness( it.second.interf.locality.dcId(), self->changedDcIds.get().second.get() ); + uint8_t fitness = ClusterControllerPriorityInfo::calculateDCFitness( it.second.details.interf.locality.dcId(), self->changedDcIds.get().second.get() ); if ( it.first != self->clusterControllerProcessId && it.second.priorityInfo.dcFitness != fitness ) { updated = true; it.second.priorityInfo.dcFitness = fitness; if(!it.second.reply.isSet()) { - it.second.reply.send( RegisterWorkerReply( it.second.processClass, it.second.priorityInfo ) ); + it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) ); } } } @@ -2323,11 +2321,11 @@ ACTOR Future startDataDistributor( ClusterControllerDa std::map>, int> id_used = self->getUsedIds(); state WorkerFitnessInfo data_distributor = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::DataDistributor, ProcessClass::NeverAssign, self->db.config, id_used); req.reqId = g_random->randomUniqueID(); - TraceEvent("ClusterController_DataDistributorRecruit", req.reqId).detail("Addr", data_distributor.worker.first.address()); + TraceEvent("ClusterController_DataDistributorRecruit", req.reqId).detail("Addr", data_distributor.worker.interf.address()); - ErrorOr distributor = wait( data_distributor.worker.first.dataDistributor.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); + ErrorOr distributor = wait( data_distributor.worker.interf.dataDistributor.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); if (distributor.present()) { - TraceEvent("ClusterController_DataDistributorRecruited", req.reqId).detail("Addr", data_distributor.worker.first.address()); + TraceEvent("ClusterController_DataDistributorRecruited", req.reqId).detail("Addr", data_distributor.worker.interf.address()); return distributor.get(); } } @@ -2417,19 +2415,19 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, registerWorker( req, &self ); } 
when( GetWorkersRequest req = waitNext( interf.getWorkers.getFuture() ) ) { - vector> workers; + vector workers; auto masterAddr = self.db.serverInfo->get().master.address(); for(auto& it : self.id_worker) { - if ( (req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && self.db.config.isExcludedServer(it.second.interf.address()) ) { + if ( (req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && self.db.config.isExcludedServer(it.second.details.interf.address()) ) { continue; } - if ( (req.flags & GetWorkersRequest::TESTER_CLASS_ONLY) && it.second.processClass.classType() != ProcessClass::TesterClass ) { + if ( (req.flags & GetWorkersRequest::TESTER_CLASS_ONLY) && it.second.details.processClass.classType() != ProcessClass::TesterClass ) { continue; } - workers.push_back(std::make_pair(it.second.interf, it.second.processClass)); + workers.push_back(it.second.details); } req.reply.send( workers ); @@ -2437,8 +2435,8 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, when( GetClientWorkersRequest req = waitNext( interf.clientInterface.getClientWorkers.getFuture() ) ) { vector workers; for(auto& it : self.id_worker) { - if (it.second.processClass.classType() != ProcessClass::TesterClass) { - workers.push_back(it.second.interf.clientInterface); + if (it.second.details.processClass.classType() != ProcessClass::TesterClass) { + workers.push_back(it.second.details.interf.clientInterface); } } req.reply.send(workers); @@ -2446,7 +2444,7 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, when( wait( coordinationPingDelay ) ) { CoordinationPingMessage message(self.id, step++); for(auto& it : self.id_worker) - it.second.interf.coordinationPing.send(message); + it.second.details.interf.coordinationPing.send(message); coordinationPingDelay = delay( SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY ); TraceEvent("CoordinationPingSent", self.id).detail("TimeStep", message.timeStep); } diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index aefbe167c8..b81955e99e 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -184,7 +184,7 @@ struct GetWorkersRequest { enum { TESTER_CLASS_ONLY = 0x1, NON_EXCLUDED_PROCESSES_ONLY = 0x2 }; int flags; - ReplyPromise>> reply; + ReplyPromise> reply; GetWorkersRequest() : flags(0) {} explicit GetWorkersRequest(int fl) : flags(fl) {} diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 7e5c0a0fb4..4bdee29350 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -30,10 +30,10 @@ #include "fdbclient/ManagementAPI.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-ACTOR Future>> getWorkers( Reference> dbInfo, int flags = 0 ) { +ACTOR Future> getWorkers( Reference> dbInfo, int flags = 0 ) { loop { choose { - when( vector> w = wait( brokenPromiseToNever( dbInfo->get().clusterInterface.getWorkers.getReply( GetWorkersRequest( flags ) ) ) ) ) { + when( vector w = wait( brokenPromiseToNever( dbInfo->get().clusterInterface.getWorkers.getReply( GetWorkersRequest( flags ) ) ) ) ) { return w; } when( wait( dbInfo->onChange() ) ) {} @@ -46,12 +46,12 @@ ACTOR Future getMasterWorker( Database cx, Reference> workers = wait( getWorkers( dbInfo ) ); + state vector workers = wait( getWorkers( dbInfo ) ); for( int i = 0; i < workers.size(); i++ ) { - if( workers[i].first.address() == dbInfo->get().master.address() ) { - TraceEvent("GetMasterWorker").detail("Stage", "GotWorkers").detail("MasterId", dbInfo->get().master.id()).detail("WorkerId", workers[i].first.id()); - return workers[i].first; + if( workers[i].interf.address() == dbInfo->get().master.address() ) { + TraceEvent("GetMasterWorker").detail("Stage", "GotWorkers").detail("MasterId", dbInfo->get().master.id()).detail("WorkerId", workers[i].interf.id()); + return workers[i].interf; } } @@ -69,15 +69,15 @@ ACTOR Future getDataDistributorWorker( Database cx, Reference> workers = wait( getWorkers( dbInfo ) ); + state vector workers = wait( getWorkers( dbInfo ) ); if (!dbInfo->get().distributor.present()) continue; for( int i = 0; i < workers.size(); i++ ) { - if( workers[i].first.address() == dbInfo->get().distributor.get().address() ) { + if( workers[i].interf.address() == dbInfo->get().distributor.get().address() ) { TraceEvent("GetDataDistributorWorker").detail("Stage", "GotWorkers") .detail("DataDistributorId", dbInfo->get().distributor.get().id()) - .detail("WorkerId", workers[i].first.id()); - return workers[i].first; + .detail("WorkerId", workers[i].interf.id()); + return workers[i].interf; } } @@ -128,10 +128,10 @@ int64_t getQueueSize( const TraceEventFields& md ) { ACTOR Future getMaxTLogQueueSize( Database cx, Reference> dbInfo ) { TraceEvent("MaxTLogQueueSize").detail("Stage", "ContactingLogs"); - state std::vector> workers = wait(getWorkers(dbInfo)); + state std::vector workers = wait(getWorkers(dbInfo)); std::map workersMap; for(auto worker : workers) { - workersMap[worker.first.address()] = worker.first; + workersMap[worker.interf.address()] = worker.interf; } state std::vector> messages; @@ -189,14 +189,14 @@ ACTOR Future getMaxStorageServerQueueSize( Database cx, Reference> serversFuture = getStorageServers(cx); - state Future>> workersFuture = getWorkers(dbInfo); + state Future> workersFuture = getWorkers(dbInfo); state std::vector servers = wait(serversFuture); - state std::vector> workers = wait(workersFuture); + state std::vector workers = wait(workersFuture); std::map workersMap; for(auto worker : workers) { - workersMap[worker.first.address()] = worker.first; + workersMap[worker.interf.address()] = worker.interf; } state std::vector> messages; diff --git a/fdbserver/QuietDatabase.h b/fdbserver/QuietDatabase.h index 175d267c37..50a501fd40 100644 --- a/fdbserver/QuietDatabase.h +++ b/fdbserver/QuietDatabase.h @@ -35,7 +35,7 @@ Future getDataDistributionQueueSize( Database const &cx, Reference getTeamCollectionValid(Database const& cx, WorkerInterface const&); Future getTeamCollectionValid(Database const& cx, Reference> const&); Future> getStorageServers( Database const& cx, bool const &use_system_priority = false); -Future>> getWorkers( Reference> const& dbInfo, int const& flags = 0 ); 
+Future> getWorkers( Reference> const& dbInfo, int const& flags = 0 ); Future getMasterWorker( Database const& cx, Reference> const& dbInfo ); Future repairDeadDatacenter(Database const& cx, Reference> const& dbInfo, std::string const& context); diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 165b77c01e..40459cd722 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -100,12 +100,12 @@ ACTOR static Future< Optional > latestEventOnWorker(WorkerInte } } -ACTOR static Future< Optional< std::pair> > > latestEventOnWorkers(std::vector> workers, std::string eventName) { +ACTOR static Future< Optional< std::pair> > > latestEventOnWorkers(std::vector workers, std::string eventName) { try { state vector>> eventTraces; for (int c = 0; c < workers.size(); c++) { EventLogRequest req = eventName.size() > 0 ? EventLogRequest(Standalone(eventName)) : EventLogRequest(); - eventTraces.push_back(errorOr(timeoutError(workers[c].first.eventLogRequest.getReply(req), 2.0))); + eventTraces.push_back(errorOr(timeoutError(workers[c].interf.eventLogRequest.getReply(req), 2.0))); } wait(waitForAll(eventTraces)); @@ -116,11 +116,11 @@ ACTOR static Future< Optional< std::pair> > for (int i = 0; i < eventTraces.size(); i++) { const ErrorOr& v = eventTraces[i].get(); if (v.isError()){ - failed.insert(workers[i].first.address().toString()); - results[workers[i].first.address()] = TraceEventFields(); + failed.insert(workers[i].interf.address().toString()); + results[workers[i].interf.address()] = TraceEventFields(); } else { - results[workers[i].first.address()] = v.get(); + results[workers[i].interf.address()] = v.get(); } } @@ -135,26 +135,26 @@ ACTOR static Future< Optional< std::pair> > throw; } } -static Future< Optional< std::pair> > > latestErrorOnWorkers(std::vector> workers) { +static Future< Optional< std::pair> > > latestErrorOnWorkers(std::vector workers) { return latestEventOnWorkers( workers, "" ); } -static Optional> getWorker(std::vector> const& workers, NetworkAddress const& address) { +static Optional getWorker(std::vector const& workers, NetworkAddress const& address) { try { for (int c = 0; c < workers.size(); c++) - if (address == workers[c].first.address()) + if (address == workers[c].interf.address()) return workers[c]; - return Optional>(); + return Optional(); } catch (Error &e){ - return Optional>(); + return Optional(); } } -static Optional> getWorker(std::map> const& workersMap, NetworkAddress const& address) { +static Optional getWorker(std::map const& workersMap, NetworkAddress const& address) { auto itr = workersMap.find(address); if(itr == workersMap.end()) { - return Optional>(); + return Optional(); } return itr->second; @@ -261,7 +261,7 @@ static JsonBuilderObject getError(const TraceEventFields& errorFields) { return statusObj; } -static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vector> workers, Optional configuration, std::set *incomplete_reasons) { +static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vector workers, Optional configuration, std::set *incomplete_reasons) { JsonBuilderObject machineMap; double metric; int failed = 0; @@ -274,9 +274,9 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vector machineJsonMap; for (auto const& worker : workers){ - locality[worker.first.address()] = worker.first.locality; - if (worker.first.locality.dcId().present()) - dcIds[worker.first.address()] = worker.first.locality.dcId().get().printable(); + locality[worker.interf.address()] = 
worker.interf.locality; + if (worker.interf.locality.dcId().present()) + dcIds[worker.interf.address()] = worker.interf.locality.dcId().get().printable(); } for(auto it = mMetrics.begin(); it != mMetrics.end(); it++) { @@ -540,7 +540,7 @@ struct RolesInfo { ACTOR static Future processStatusFetcher( Reference> db, - std::vector> workers, + std::vector workers, WorkerEvents pMetrics, WorkerEvents mMetrics, WorkerEvents errors, @@ -581,13 +581,13 @@ ACTOR static Future processStatusFetcher( } state std::map>, MachineMemoryInfo> machineMemoryUsage; - state std::vector>::iterator workerItr; + state std::vector::iterator workerItr; for(workerItr = workers.begin(); workerItr != workers.end(); ++workerItr) { wait(yield()); - state std::map>, MachineMemoryInfo>::iterator memInfo = machineMemoryUsage.insert(std::make_pair(workerItr->first.locality.machineId(), MachineMemoryInfo())).first; + state std::map>, MachineMemoryInfo>::iterator memInfo = machineMemoryUsage.insert(std::make_pair(workerItr->interf.locality.machineId(), MachineMemoryInfo())).first; try { - ASSERT(pMetrics.count(workerItr->first.address())); - const TraceEventFields& processMetrics = pMetrics[workerItr->first.address()]; + ASSERT(pMetrics.count(workerItr->interf.address())); + const TraceEventFields& processMetrics = pMetrics[workerItr->interf.address()]; if(memInfo->second.valid()) { if(processMetrics.size() > 0) { @@ -647,10 +647,10 @@ ACTOR static Future processStatusFetcher( wait(yield()); state JsonBuilderObject statusObj; try { - ASSERT(pMetrics.count(workerItr->first.address())); + ASSERT(pMetrics.count(workerItr->interf.address())); - NetworkAddress address = workerItr->first.address(); - const TraceEventFields& event = pMetrics[workerItr->first.address()]; + NetworkAddress address = workerItr->interf.address(); + const TraceEventFields& event = pMetrics[workerItr->interf.address()]; statusObj["address"] = address.toString(); JsonBuilderObject memoryObj; @@ -661,7 +661,7 @@ ACTOR static Future processStatusFetcher( std::string MachineID = event.getValue("MachineID"); statusObj["machine_id"] = MachineID; - statusObj["locality"] = getLocalityInfo(workerItr->first.locality); + statusObj["locality"] = getLocalityInfo(workerItr->interf.locality); statusObj.setKeyRawNumber("uptime_seconds",event.getValue("UptimeSeconds")); @@ -750,7 +750,7 @@ ACTOR static Future processStatusFetcher( double availableMemory; availableMemory = mMetrics[address].getDouble("AvailableMemory"); - auto machineMemInfo = machineMemoryUsage[workerItr->first.locality.machineId()]; + auto machineMemInfo = machineMemoryUsage[workerItr->interf.locality.machineId()]; if (machineMemInfo.valid()) { ASSERT(machineMemInfo.numProcesses > 0); int64_t memory = (availableMemory + machineMemInfo.memoryUsage) / machineMemInfo.numProcesses; @@ -794,8 +794,8 @@ ACTOR static Future processStatusFetcher( statusObj["excluded"] = configuration.get().isExcludedServer(address); } - statusObj["class_type"] = workerItr->second.toString(); - statusObj["class_source"] = workerItr->second.sourceString(); + statusObj["class_type"] = workerItr->processClass.toString(); + statusObj["class_source"] = workerItr->processClass.sourceString(); } catch (Error& e){ @@ -803,7 +803,7 @@ ACTOR static Future processStatusFetcher( incomplete_reasons->insert("Cannot retrieve all process status information."); } - processMap[printable(workerItr->first.locality.processId())] = statusObj; + processMap[printable(workerItr->interf.locality.processId())] = statusObj; } return processMap; } @@ -847,11 
+847,11 @@ static JsonBuilderObject clientStatusFetcher(ClientVersionMap clientVersionMap, return clientStatus; } -ACTOR static Future recoveryStateStatusFetcher(std::pair mWorker, int workerCount, std::set *incomplete_reasons, int* statusCode) { +ACTOR static Future recoveryStateStatusFetcher(WorkerDetails mWorker, int workerCount, std::set *incomplete_reasons, int* statusCode) { state JsonBuilderObject message; try { - TraceEventFields md = wait( timeoutError(mWorker.first.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryState") ) ), 1.0) ); + TraceEventFields md = wait( timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryState") ) ), 1.0) ); state int mStatusCode = md.getInt("StatusCode"); if (mStatusCode < 0 || mStatusCode >= RecoveryStatus::END) throw attribute_not_found(); @@ -1100,18 +1100,18 @@ static JsonBuilderObject configurationFetcher(Optional co return statusObj; } -ACTOR static Future dataStatusFetcher(std::pair ddWorker, int *minReplicasRemaining) { +ACTOR static Future dataStatusFetcher(WorkerDetails ddWorker, int *minReplicasRemaining) { state JsonBuilderObject statusObjData; try { std::vector> futures; // TODO: Should this be serial? - futures.push_back(timeoutError(ddWorker.first.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("DDTrackerStarting"))), 1.0)); - futures.push_back(timeoutError(ddWorker.first.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("DDTrackerStats"))), 1.0)); - futures.push_back(timeoutError(ddWorker.first.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("MovingData"))), 1.0)); - futures.push_back(timeoutError(ddWorker.first.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("TotalDataInFlight"))), 1.0)); - futures.push_back(timeoutError(ddWorker.first.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("TotalDataInFlightRemote"))), 1.0)); + futures.push_back(timeoutError(ddWorker.interf.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("DDTrackerStarting"))), 1.0)); + futures.push_back(timeoutError(ddWorker.interf.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("DDTrackerStats"))), 1.0)); + futures.push_back(timeoutError(ddWorker.interf.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("MovingData"))), 1.0)); + futures.push_back(timeoutError(ddWorker.interf.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("TotalDataInFlight"))), 1.0)); + futures.push_back(timeoutError(ddWorker.interf.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("TotalDataInFlightRemote"))), 1.0)); std::vector dataInfo = wait(getAll(futures)); @@ -1324,16 +1324,16 @@ ACTOR static Future>> getProxie return results; } -static int getExtraTLogEligibleMachines(const vector>& workers, const DatabaseConfiguration& configuration) { +static int getExtraTLogEligibleMachines(const vector& workers, const DatabaseConfiguration& configuration) { std::set allMachines; std::map> dcId_machine; for(auto const& worker : workers) { - if(worker.second.machineClassFitness(ProcessClass::TLog) < ProcessClass::NeverAssign - && !configuration.isExcludedServer(worker.first.address())) + if(worker.processClass.machineClassFitness(ProcessClass::TLog) < ProcessClass::NeverAssign + && !configuration.isExcludedServer(worker.interf.address())) { - allMachines.insert(worker.first.locality.zoneId().get()); - if(worker.first.locality.dcId().present()) { - dcId_machine[worker.first.locality.dcId().get()].insert(worker.first.locality.zoneId().get()); + 
allMachines.insert(worker.interf.locality.zoneId().get()); + if(worker.interf.locality.dcId().present()) { + dcId_machine[worker.interf.locality.dcId().get()].insert(worker.interf.locality.zoneId().get()); } } } @@ -1387,7 +1387,7 @@ JsonBuilderObject getPerfLimit(TraceEventFields const& ratekeeper, double transP return perfLimit; } -ACTOR static Future workloadStatusFetcher(Reference> db, vector> workers, std::pair mWorker, std::pair ddWorker, +ACTOR static Future workloadStatusFetcher(Reference> db, vector workers, WorkerDetails mWorker, WorkerDetails ddWorker, JsonBuilderObject *qos, JsonBuilderObject *data_overlay, std::set *incomplete_reasons, Future>>> storageServerFuture) { state JsonBuilderObject statusObj; @@ -1398,14 +1398,14 @@ ACTOR static Future workloadStatusFetcher(Reference> proxyStatFutures; - std::map> workersMap; + std::map workersMap; for (auto const& w : workers) { - workersMap[w.first.address()] = w; + workersMap[w.interf.address()] = w; } for (auto &p : db->get().client.proxies) { auto worker = getWorker(workersMap, p.address()); if (worker.present()) - proxyStatFutures.push_back(timeoutError(worker.get().first.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("ProxyMetrics"))), 1.0)); + proxyStatFutures.push_back(timeoutError(worker.get().interf.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("ProxyMetrics"))), 1.0)); else throw all_alternatives_failed(); // We need data from all proxies for this result to be trustworthy } @@ -1439,8 +1439,8 @@ ACTOR static Future workloadStatusFetcher(Reference>& workers, int extraTlogEligibleMachines, int minReplicasRemaining) { +static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration configuration, ServerCoordinators coordinators, std::vector& workers, int extraTlogEligibleMachines, int minReplicasRemaining) { JsonBuilderObject statusObj; // without losing data @@ -1605,7 +1605,7 @@ static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration confi std::map workerZones; for(auto& worker : workers) { - workerZones[worker.first.address()] = worker.first.locality.zoneId().orDefault(LiteralStringRef("")); + workerZones[worker.interf.address()] = worker.interf.locality.zoneId().orDefault(LiteralStringRef("")); } std::map coordinatorZoneCounts; for(auto& coordinator : coordinators.ccf->getConnectionString().coordinators()) { @@ -1800,7 +1800,7 @@ ACTOR Future lockedStatusFetcher(Reference clusterGetStatus( Reference> db, Database cx, - vector> workers, + vector workers, ProcessIssuesMap workerIssues, ProcessIssuesMap clientIssues, ClientVersionMap clientVersionMap, @@ -1814,19 +1814,19 @@ ACTOR Future clusterGetStatus( // Check if master worker is present state JsonBuilderArray messages; state std::set status_incomplete_reasons; - state std::pair mWorker; - state std::pair ddWorker; // DataDistributor worker + state WorkerDetails mWorker; + state WorkerDetails ddWorker; // DataDistributor worker try { // Get the master Worker interface - Optional> _mWorker = getWorker( workers, db->get().master.address() ); + Optional _mWorker = getWorker( workers, db->get().master.address() ); if (_mWorker.present()) { mWorker = _mWorker.get(); } else { messages.push_back(JsonString::makeMessage("unreachable_master_worker", "Unable to locate the master worker.")); } // Get the DataDistributor worker interface - Optional> _ddWorker; + Optional _ddWorker; if (db->get().distributor.present()) { _ddWorker = getWorker( workers, db->get().distributor.get().address() ); } @@ -1930,7 +1930,7 @@ ACTOR Future 
clusterGetStatus( // in status output is important to give context to error messages in status that reference a storage server role ID. state std::unordered_map address_workers; for (auto const& worker : workers) { - address_workers[worker.first.address()] = worker.first; + address_workers[worker.interf.address()] = worker.interf; } state Future>>> storageServerFuture = errorOr(getStorageServersAndMetrics(cx, address_workers)); diff --git a/fdbserver/Status.h b/fdbserver/Status.h index 9697d3d77d..116e5fc2d3 100644 --- a/fdbserver/Status.h +++ b/fdbserver/Status.h @@ -30,7 +30,7 @@ typedef std::map< NetworkAddress, std::pair > ProcessIssuesMap; typedef std::map< NetworkAddress, Standalone> > ClientVersionMap; -Future clusterGetStatus( Reference> const& db, Database const& cx, vector> const& workers, +Future clusterGetStatus( Reference> const& db, Database const& cx, vector const& workers, ProcessIssuesMap const& workerIssues, ProcessIssuesMap const& clientIssues, ClientVersionMap const& clientVersionMap, std::map const& traceLogGroupMap, ServerCoordinators const& coordinators, std::vector const& incompatibleConnections, Version const& datacenterVersionDifference ); diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index c3f6b1bb49..e339706415 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -72,6 +72,20 @@ struct WorkerInterface { } }; +struct WorkerDetails { + WorkerInterface interf; + ProcessClass processClass; + bool degraded; + + WorkerDetails() : degraded(false) {} + WorkerDetails(const WorkerInterface& interf, ProcessClass processClass, bool degraded) : interf(interf), processClass(processClass), degraded(degraded) {} + + template + void serialize(Ar& ar) { + serializer(ar, interf, processClass, degraded); + } +}; + struct InitializeTLogRequest { UID recruitmentID; LogSystemConfig recoverFrom; diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 135053f0e6..7dd66ccc99 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -1095,11 +1095,11 @@ ACTOR Future runTests( Reference testerTimeout = delay(600.0); // wait 600 sec for testers to show up - state vector> workers; + state vector workers; loop { choose { - when( vector> w = wait( cc->get().present() ? brokenPromiseToNever( cc->get().get().getWorkers.getReply( GetWorkersRequest( flags ) ) ) : Never() ) ) { + when( vector w = wait( cc->get().present() ? 
brokenPromiseToNever( cc->get().get().getWorkers.getReply( GetWorkersRequest( flags ) ) ) : Never() ) ) { if (w.size() >= minTestersExpected) { workers = w; break; @@ -1116,7 +1116,7 @@ ACTOR Future runTests( Reference ts; for(int i=0; i checkForStorage(Database cx, DatabaseConfiguration configuration, ConsistencyCheckWorkload *self) { - state vector> workers = wait( getWorkers( self->dbInfo ) ); + state vector workers = wait( getWorkers( self->dbInfo ) ); state vector storageServers = wait( getStorageServers( cx ) ); std::set> missingStorage; for( int i = 0; i < workers.size(); i++ ) { - if( !configuration.isExcludedServer(workers[i].first.address()) && - ( workers[i].second == ProcessClass::StorageClass || workers[i].second == ProcessClass::UnsetClass ) ) { + if( !configuration.isExcludedServer(workers[i].interf.address()) && + ( workers[i].processClass == ProcessClass::StorageClass || workers[i].processClass == ProcessClass::UnsetClass ) ) { bool found = false; for( int j = 0; j < storageServers.size(); j++ ) { - if( storageServers[j].address() == workers[i].first.address() ) { + if( storageServers[j].address() == workers[i].interf.address() ) { found = true; break; } } if( !found ) { TraceEvent("ConsistencyCheck_NoStorage") - .detail("Address", workers[i].first.address()) + .detail("Address", workers[i].interf.address()) .detail("ProcessClassEqualToStorageClass", - (int)(workers[i].second == ProcessClass::StorageClass)); - missingStorage.insert(workers[i].first.locality.dcId()); + (int)(workers[i].processClass == ProcessClass::StorageClass)); + missingStorage.insert(workers[i].interf.locality.dcId()); } } } @@ -1125,12 +1125,12 @@ struct ConsistencyCheckWorkload : TestWorkload } ACTOR Future checkForExtraDataStores(Database cx, ConsistencyCheckWorkload *self) { - state vector> workers = wait( getWorkers( self->dbInfo ) ); + state vector workers = wait( getWorkers( self->dbInfo ) ); state vector storageServers = wait( getStorageServers( cx ) ); auto& db = self->dbInfo->get(); state std::vector logs = db.logSystemConfig.allPresentLogs(); - state std::vector>::iterator itr; + state std::vector::iterator itr; state bool foundExtraDataStore = false; state std::map> statefulProcesses; @@ -1142,19 +1142,19 @@ struct ConsistencyCheckWorkload : TestWorkload } for(itr = workers.begin(); itr != workers.end(); ++itr) { - ErrorOr>> stores = wait(itr->first.diskStoreRequest.getReplyUnlessFailedFor(DiskStoreRequest(false), 2, 0)); + ErrorOr>> stores = wait(itr->interf.diskStoreRequest.getReplyUnlessFailedFor(DiskStoreRequest(false), 2, 0)); if(stores.isError()) { - TraceEvent("ConsistencyCheck_GetDataStoreFailure").error(stores.getError()).detail("Address", itr->first.address()); + TraceEvent("ConsistencyCheck_GetDataStoreFailure").error(stores.getError()).detail("Address", itr->interf.address()); self->testFailure("Failed to get data stores"); return false; } for(auto id : stores.get()) { - if(!statefulProcesses[itr->first.address()].count(id)) { - TraceEvent("ConsistencyCheck_ExtraDataStore").detail("Address", itr->first.address()).detail("DataStoreID", id); + if(!statefulProcesses[itr->interf.address()].count(id)) { + TraceEvent("ConsistencyCheck_ExtraDataStore").detail("Address", itr->interf.address()).detail("DataStoreID", id); if(g_network->isSimulated()) { - TraceEvent("ConsistencyCheck_RebootProcess").detail("Address", itr->first.address()).detail("DataStoreID", id); - g_simulator.rebootProcess(g_simulator.getProcessByAddress(itr->first.address()), ISimulator::RebootProcess); + 
TraceEvent("ConsistencyCheck_RebootProcess").detail("Address", itr->interf.address()).detail("DataStoreID", id); + g_simulator.rebootProcess(g_simulator.getProcessByAddress(itr->interf.address()), ISimulator::RebootProcess); } foundExtraDataStore = true; @@ -1172,17 +1172,17 @@ struct ConsistencyCheckWorkload : TestWorkload //Returns true if the worker at the given address has the specified machineClass or has an unset class //The interfaceType paramater is used in a TraceEvent, should be something like (Master, MasterProxy, StorageServer, ...) - bool workerHasClass(vector> workers, NetworkAddress address, ProcessClass::ClassType machineClass, std::string interfaceType) + bool workerHasClass(vector workers, NetworkAddress address, ProcessClass::ClassType machineClass, std::string interfaceType) { //Search all workers until the correct one is found for(int i = 0; i < workers.size(); i++) { - if(workers[i].first.address() == address) + if(workers[i].interf.address() == address) { - if(workers[i].second == machineClass || workers[i].second == ProcessClass::UnsetClass) + if(workers[i].processClass == machineClass || workers[i].processClass == ProcessClass::UnsetClass) return true; - TraceEvent("ConsistencyCheck_InvalidClassType").detail("RequestedClass", workers[i].second.toString()) + TraceEvent("ConsistencyCheck_InvalidClassType").detail("RequestedClass", workers[i].processClass.toString()) .detail("ActualClass", ProcessClass(machineClass, ProcessClass::CommandLineSource).toString()).detail("InterfaceType", interfaceType); return false; @@ -1200,16 +1200,16 @@ struct ConsistencyCheckWorkload : TestWorkload if(g_simulator.extraDB) return true; - vector> workers = wait( getWorkers( self->dbInfo ) ); + vector workers = wait( getWorkers( self->dbInfo ) ); std::set workerAddresses; for( auto it : workers ) { - ISimulator::ProcessInfo* info = g_simulator.getProcessByAddress(it.first.address()); + ISimulator::ProcessInfo* info = g_simulator.getProcessByAddress(it.interf.address()); if(!info || info->failed) { - TraceEvent("ConsistencyCheck_FailedWorkerInList").detail("Addr", it.first.address()); + TraceEvent("ConsistencyCheck_FailedWorkerInList").detail("Addr", it.interf.address()); return false; } - workerAddresses.insert( NetworkAddress(it.first.address().ip, it.first.address().port, true, false) ); + workerAddresses.insert( NetworkAddress(it.interf.address().ip, it.interf.address().port, true, false) ); } vector all = g_simulator.getAllProcesses(); @@ -1281,34 +1281,33 @@ struct ConsistencyCheckWorkload : TestWorkload } } - typedef std::pair WorkerClassPair; //Returns true if all machines in the cluster that specified a desired class are operating in that class ACTOR Future checkUsingDesiredClasses(Database cx, ConsistencyCheckWorkload *self) { state Optional expectedPrimaryDcId; state Optional expectedRemoteDcId; state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx)); - state vector allWorkers = wait(getWorkers(self->dbInfo)); - state vector nonExcludedWorkers = wait(getWorkers(self->dbInfo, GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY)); + state vector allWorkers = wait(getWorkers(self->dbInfo)); + state vector nonExcludedWorkers = wait(getWorkers(self->dbInfo, GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY)); auto& db = self->dbInfo->get(); - std::map allWorkerProcessMap; + std::map allWorkerProcessMap; std::map, std::vector> dcToAllClassTypes; for (auto worker : allWorkers) { - allWorkerProcessMap[worker.first.address()] = worker; - Optional dc = 
worker.first.locality._data[LocalityData::keyDcId]; + allWorkerProcessMap[worker.interf.address()] = worker; + Optional dc = worker.interf.locality._data[LocalityData::keyDcId]; if (!dcToAllClassTypes.count(dc)) dcToAllClassTypes.insert({}); - dcToAllClassTypes[dc].push_back(worker.second.classType()); + dcToAllClassTypes[dc].push_back(worker.processClass.classType()); } - std::map nonExcludedWorkerProcessMap; + std::map nonExcludedWorkerProcessMap; std::map, std::vector> dcToNonExcludedClassTypes; for (auto worker : nonExcludedWorkers) { - nonExcludedWorkerProcessMap[worker.first.address()] = worker; - Optional dc = worker.first.locality._data[LocalityData::keyDcId]; + nonExcludedWorkerProcessMap[worker.interf.address()] = worker; + Optional dc = worker.interf.locality._data[LocalityData::keyDcId]; if (!dcToNonExcludedClassTypes.count(dc)) dcToNonExcludedClassTypes.insert({}); - dcToNonExcludedClassTypes[dc].push_back(worker.second.classType()); + dcToNonExcludedClassTypes[dc].push_back(worker.processClass.classType()); } if (!allWorkerProcessMap.count(db.clusterInterface.clientInterface.address())) { @@ -1320,8 +1319,8 @@ struct ConsistencyCheckWorkload : TestWorkload return false; } - Optional ccDcId = allWorkerProcessMap[db.clusterInterface.clientInterface.address()].first.locality._data[LocalityData::keyDcId]; - Optional masterDcId = allWorkerProcessMap[db.master.address()].first.locality._data[LocalityData::keyDcId]; + Optional ccDcId = allWorkerProcessMap[db.clusterInterface.clientInterface.address()].interf.locality._data[LocalityData::keyDcId]; + Optional masterDcId = allWorkerProcessMap[db.master.address()].interf.locality._data[LocalityData::keyDcId]; if (ccDcId != masterDcId) { TraceEvent("ConsistencyCheck_CCAndMasterNotInSameDC").detail("ClusterControllerDcId", getOptionalString(ccDcId)).detail("MasterDcId", getOptionalString(masterDcId)); @@ -1351,8 +1350,8 @@ struct ConsistencyCheckWorkload : TestWorkload // Check CC ProcessClass::Fitness bestClusterControllerFitness = getBestAvailableFitness(dcToNonExcludedClassTypes[ccDcId], ProcessClass::ClusterController); - if (!nonExcludedWorkerProcessMap.count(db.clusterInterface.clientInterface.address()) || nonExcludedWorkerProcessMap[db.clusterInterface.clientInterface.address()].second.machineClassFitness(ProcessClass::ClusterController) != bestClusterControllerFitness) { - TraceEvent("ConsistencyCheck_ClusterControllerNotBest").detail("BestClusterControllerFitness", bestClusterControllerFitness).detail("ExistingClusterControllerFit", nonExcludedWorkerProcessMap.count(db.clusterInterface.clientInterface.address()) ? nonExcludedWorkerProcessMap[db.clusterInterface.clientInterface.address()].second.machineClassFitness(ProcessClass::ClusterController) : -1); + if (!nonExcludedWorkerProcessMap.count(db.clusterInterface.clientInterface.address()) || nonExcludedWorkerProcessMap[db.clusterInterface.clientInterface.address()].processClass.machineClassFitness(ProcessClass::ClusterController) != bestClusterControllerFitness) { + TraceEvent("ConsistencyCheck_ClusterControllerNotBest").detail("BestClusterControllerFitness", bestClusterControllerFitness).detail("ExistingClusterControllerFit", nonExcludedWorkerProcessMap.count(db.clusterInterface.clientInterface.address()) ? 
nonExcludedWorkerProcessMap[db.clusterInterface.clientInterface.address()].processClass.machineClassFitness(ProcessClass::ClusterController) : -1); return false; } @@ -1365,16 +1364,16 @@ struct ConsistencyCheckWorkload : TestWorkload } } - if ((!nonExcludedWorkerProcessMap.count(db.master.address()) && bestMasterFitness != ProcessClass::ExcludeFit) || nonExcludedWorkerProcessMap[db.master.address()].second.machineClassFitness(ProcessClass::Master) != bestMasterFitness) { - TraceEvent("ConsistencyCheck_MasterNotBest").detail("BestMasterFitness", bestMasterFitness).detail("ExistingMasterFit", nonExcludedWorkerProcessMap.count(db.master.address()) ? nonExcludedWorkerProcessMap[db.master.address()].second.machineClassFitness(ProcessClass::Master) : -1); + if ((!nonExcludedWorkerProcessMap.count(db.master.address()) && bestMasterFitness != ProcessClass::ExcludeFit) || nonExcludedWorkerProcessMap[db.master.address()].processClass.machineClassFitness(ProcessClass::Master) != bestMasterFitness) { + TraceEvent("ConsistencyCheck_MasterNotBest").detail("BestMasterFitness", bestMasterFitness).detail("ExistingMasterFit", nonExcludedWorkerProcessMap.count(db.master.address()) ? nonExcludedWorkerProcessMap[db.master.address()].processClass.machineClassFitness(ProcessClass::Master) : -1); return false; } // Check proxy ProcessClass::Fitness bestMasterProxyFitness = getBestAvailableFitness(dcToNonExcludedClassTypes[masterDcId], ProcessClass::Proxy); for (auto masterProxy : db.client.proxies) { - if (!nonExcludedWorkerProcessMap.count(masterProxy.address()) || nonExcludedWorkerProcessMap[masterProxy.address()].second.machineClassFitness(ProcessClass::Proxy) != bestMasterProxyFitness) { - TraceEvent("ConsistencyCheck_ProxyNotBest").detail("BestMasterProxyFitness", bestMasterProxyFitness).detail("ExistingMasterProxyFitness", nonExcludedWorkerProcessMap.count(masterProxy.address()) ? nonExcludedWorkerProcessMap[masterProxy.address()].second.machineClassFitness(ProcessClass::Proxy) : -1); + if (!nonExcludedWorkerProcessMap.count(masterProxy.address()) || nonExcludedWorkerProcessMap[masterProxy.address()].processClass.machineClassFitness(ProcessClass::Proxy) != bestMasterProxyFitness) { + TraceEvent("ConsistencyCheck_ProxyNotBest").detail("BestMasterProxyFitness", bestMasterProxyFitness).detail("ExistingMasterProxyFitness", nonExcludedWorkerProcessMap.count(masterProxy.address()) ? nonExcludedWorkerProcessMap[masterProxy.address()].processClass.machineClassFitness(ProcessClass::Proxy) : -1); return false; } } @@ -1382,8 +1381,8 @@ struct ConsistencyCheckWorkload : TestWorkload // Check resolver ProcessClass::Fitness bestResolverFitness = getBestAvailableFitness(dcToNonExcludedClassTypes[masterDcId], ProcessClass::Resolver); for (auto resolver : db.resolvers) { - if (!nonExcludedWorkerProcessMap.count(resolver.address()) || nonExcludedWorkerProcessMap[resolver.address()].second.machineClassFitness(ProcessClass::Resolver) != bestResolverFitness) { - TraceEvent("ConsistencyCheck_ResolverNotBest").detail("BestResolverFitness", bestResolverFitness).detail("ExistingResolverFitness", nonExcludedWorkerProcessMap.count(resolver.address()) ? 
nonExcludedWorkerProcessMap[resolver.address()].second.machineClassFitness(ProcessClass::Resolver) : -1); + if (!nonExcludedWorkerProcessMap.count(resolver.address()) || nonExcludedWorkerProcessMap[resolver.address()].processClass.machineClassFitness(ProcessClass::Resolver) != bestResolverFitness) { + TraceEvent("ConsistencyCheck_ResolverNotBest").detail("BestResolverFitness", bestResolverFitness).detail("ExistingResolverFitness", nonExcludedWorkerProcessMap.count(resolver.address()) ? nonExcludedWorkerProcessMap[resolver.address()].processClass.machineClassFitness(ProcessClass::Resolver) : -1); return false; } } diff --git a/fdbserver/workloads/CpuProfiler.actor.cpp b/fdbserver/workloads/CpuProfiler.actor.cpp index a8208d61d3..8e8c8865de 100644 --- a/fdbserver/workloads/CpuProfiler.actor.cpp +++ b/fdbserver/workloads/CpuProfiler.actor.cpp @@ -69,11 +69,11 @@ struct CpuProfilerWorkload : TestWorkload //If we are turning the profiler on, get a list of workers in the system if(enabled) { - vector> _workers = wait( getWorkers( self->dbInfo ) ); + vector _workers = wait( getWorkers( self->dbInfo ) ); vector workers; for(int i = 0; i < _workers.size(); i++) { - if (self->roles.empty() || std::find(self->roles.cbegin(), self->roles.cend(), _workers[i].second.toString()) != self->roles.cend()) { - workers.push_back(_workers[i].first); + if (self->roles.empty() || std::find(self->roles.cbegin(), self->roles.cend(), _workers[i].processClass.toString()) != self->roles.cend()) { + workers.push_back(_workers[i].interf); } } self->profilingWorkers = workers; diff --git a/fdbserver/workloads/LogMetrics.actor.cpp b/fdbserver/workloads/LogMetrics.actor.cpp index 5c064e7085..b849de9c50 100644 --- a/fdbserver/workloads/LogMetrics.actor.cpp +++ b/fdbserver/workloads/LogMetrics.actor.cpp @@ -54,12 +54,12 @@ struct LogMetricsWorkload : TestWorkload { ACTOR Future setSystemRate( LogMetricsWorkload *self, Database cx, uint32_t rate ) { // set worker interval and ss interval state BinaryWriter br(Unversioned()); - vector> workers = wait( getWorkers( self->dbInfo ) ); + vector workers = wait( getWorkers( self->dbInfo ) ); //vector> replies; TraceEvent("RateChangeTrigger"); SetMetricsLogRateRequest req(rate); for(int i = 0; i < workers.size(); i++) { - workers[i].first.setMetricsRate.send( req ); + workers[i].interf.setMetricsRate.send( req ); } //wait( waitForAll( replies ) ); diff --git a/fdbserver/workloads/Performance.actor.cpp b/fdbserver/workloads/Performance.actor.cpp index 8b4f424d9e..16c6b00528 100644 --- a/fdbserver/workloads/Performance.actor.cpp +++ b/fdbserver/workloads/Performance.actor.cpp @@ -103,11 +103,11 @@ struct PerformanceWorkload : TestWorkload { //FIXME: does not use testers which are recruited on workers ACTOR Future> getTesters( PerformanceWorkload *self) { - state vector> workers; + state vector workers; loop { choose { - when( vector> w = wait( brokenPromiseToNever( self->dbInfo->get().clusterInterface.getWorkers.getReply( GetWorkersRequest( GetWorkersRequest::TESTER_CLASS_ONLY | GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY ) ) ) ) ) { + when( vector w = wait( brokenPromiseToNever( self->dbInfo->get().clusterInterface.getWorkers.getReply( GetWorkersRequest( GetWorkersRequest::TESTER_CLASS_ONLY | GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY ) ) ) ) ) { workers = w; break; } @@ -117,7 +117,7 @@ struct PerformanceWorkload : TestWorkload { vector ts; for(int i=0; i workerPinger( PingWorkload* self ) { - vector> workers = wait( getWorkers( self->dbInfo ) ); + vector workers = wait( 
getWorkers( self->dbInfo ) ); vector> peers; for(int i=0; i> pingers; for(int i=0; iactorCount; i++) pingers.push_back( self->pinger( self, peers ) ); @@ -208,9 +208,9 @@ struct PingWorkload : TestWorkload { state Future collection = actorCollection( addActor.getFuture() ); if( self->workerBroadcast ) { - vector> workers = wait( getWorkers( self->dbInfo ) ); + vector workers = wait( getWorkers( self->dbInfo ) ); for( int i=0; i peers = wait( self->fetchInterfaces( self, cx ) ); for( int i=0; i traceDumpWorkers( Reference> db ) { try { loop { - ErrorOr>> workerList = wait( db->get().clusterInterface.getWorkers.tryGetReply( GetWorkersRequest() ) ); + ErrorOr> workerList = wait( db->get().clusterInterface.getWorkers.tryGetReply( GetWorkersRequest() ) ); if( workerList.present() ) { std::vector>> dumpRequests; for( int i = 0; i < workerList.get().size(); i++) - dumpRequests.push_back( workerList.get()[i].first.traceBatchDumpRequest.tryGetReply( TraceBatchDumpRequest() ) ); + dumpRequests.push_back( workerList.get()[i].interf.traceBatchDumpRequest.tryGetReply( TraceBatchDumpRequest() ) ); wait( waitForAll( dumpRequests ) ); return true; } diff --git a/fdbserver/workloads/TargetedKill.actor.cpp b/fdbserver/workloads/TargetedKill.actor.cpp index fea3a482a7..70ada71b79 100644 --- a/fdbserver/workloads/TargetedKill.actor.cpp +++ b/fdbserver/workloads/TargetedKill.actor.cpp @@ -61,14 +61,14 @@ struct TargetedKillWorkload : TestWorkload { return Void(); } - state vector> workers = wait( getWorkers( self->dbInfo ) ); + state vector workers = wait( getWorkers( self->dbInfo ) ); int killed = 0; for( int i = 0; i < workers.size(); i++ ) { - if( workers[i].first.master.getEndpoint().getPrimaryAddress() == address || - ( self->killAllMachineProcesses && workers[i].first.master.getEndpoint().getPrimaryAddress().ip == address.ip && workers[i].second != ProcessClass::TesterClass ) ) { - TraceEvent("WorkerKill").detail("TargetedMachine", address).detail("Worker", workers[i].first.id()); - workers[i].first.clientInterface.reboot.send( RebootRequest() ); + if( workers[i].interf.master.getEndpoint().getPrimaryAddress() == address || + ( self->killAllMachineProcesses && workers[i].interf.master.getEndpoint().getPrimaryAddress().ip == address.ip && workers[i].processClass != ProcessClass::TesterClass ) ) { + TraceEvent("WorkerKill").detail("TargetedMachine", address).detail("Worker", workers[i].interf.id()); + workers[i].interf.clientInterface.reboot.send( RebootRequest() ); } } diff --git a/fdbserver/workloads/WorkerErrors.actor.cpp b/fdbserver/workloads/WorkerErrors.actor.cpp index cd63ccf35d..c4cc4a5b51 100644 --- a/fdbserver/workloads/WorkerErrors.actor.cpp +++ b/fdbserver/workloads/WorkerErrors.actor.cpp @@ -42,10 +42,10 @@ struct WorkerErrorsWorkload : TestWorkload { virtual void getMetrics( vector& m ) {} - ACTOR Future< std::vector< TraceEventFields > > latestEventOnWorkers( std::vector> workers ) { + ACTOR Future< std::vector< TraceEventFields > > latestEventOnWorkers( std::vector workers ) { state vector> eventTraces; for(int c = 0; c < workers.size(); c++) { - eventTraces.push_back( workers[c].first.eventLogRequest.getReply( EventLogRequest() ) ); + eventTraces.push_back( workers[c].interf.eventLogRequest.getReply( EventLogRequest() ) ); } wait( timeoutError( waitForAll( eventTraces ), 2.0 ) ); @@ -59,7 +59,7 @@ struct WorkerErrorsWorkload : TestWorkload { } ACTOR Future _start(Database cx, WorkerErrorsWorkload *self) { - state vector> workers = wait( getWorkers( self->dbInfo ) ); + state vector workers 
= wait( getWorkers( self->dbInfo ) ); std::vector errors = wait( self->latestEventOnWorkers( workers ) ); for(auto e : errors) { printf("%s\n", e.toString().c_str()); From 53f16b53475f36288a92e6f02a90eb637599e58a Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 8 Mar 2019 11:46:34 -0500 Subject: [PATCH 14/46] when a tlog queue commit takes longer than 5 seconds, its process is marked as degraded --- fdbserver/ClusterRecruitmentInterface.h | 9 +++++---- fdbserver/Knobs.cpp | 4 +++- fdbserver/Knobs.h | 2 ++ fdbserver/OldTLogServer_6_0.actor.cpp | 23 +++++++++++++++++++---- fdbserver/TLogServer.actor.cpp | 23 +++++++++++++++++++---- fdbserver/WorkerInterface.actor.h | 4 ++-- fdbserver/worker.actor.cpp | 15 +++++++++------ 7 files changed, 59 insertions(+), 21 deletions(-) diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index b81955e99e..28f6ae4f08 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -169,14 +169,15 @@ struct RegisterWorkerRequest { Generation generation; Optional distributorInterf; ReplyPromise reply; + bool degraded; - RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {} - RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf) : - wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf) {} + RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {} + RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf, bool degraded) : + wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), degraded(degraded) {} template void serialize( Ar& ar ) { - serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, reply); + serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, reply, degraded); } }; diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 8b37bb55fb..1d94f56110 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -73,7 +73,9 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME, 5.0 ); init( TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES, 2e9 ); if ( randomize && BUGGIFY ) TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES = 2e6; init( DISK_QUEUE_FILE_EXTENSION_BYTES, 10<<20 ); // BUGGIFYd per file within the DiskQueue - init( DISK_QUEUE_FILE_SHRINK_BYTES, 100<<20 ); // BUGGIFYd per file within the DiskQueue + init( DISK_QUEUE_FILE_SHRINK_BYTES, 100<<20 ); // BUGGIFYd per file within the DiskQueue + init( TLOG_DEGRADED_DELAY_COUNT, 5 ); + init( TLOG_DEGRADED_DURATION, 5.0 ); // Data distribution queue init( HEALTH_POLL_TIME, 1.0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index f3698b3561..f3184b3345 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -77,6 +77,8 @@ public: int64_t TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES; int64_t DISK_QUEUE_FILE_EXTENSION_BYTES; // When we grow the disk queue, by how many bytes should it grow? 
int64_t DISK_QUEUE_FILE_SHRINK_BYTES; // When we shrink the disk queue, by how many bytes should it shrink? + int TLOG_DEGRADED_DELAY_COUNT; + double TLOG_DEGRADED_DURATION; // Data distribution queue double HEALTH_POLL_TIME; diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 0d4469fe7e..36953d0542 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -273,10 +273,12 @@ struct TLogData : NonCopyable { FlowLock concurrentLogRouterReads; FlowLock persistentDataCommitLock; - TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference> const& dbInfo) + Reference> degraded; + + TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference> dbInfo, Reference> degraded) : dbgid(dbgid), instanceID(g_random->randomUniqueID().first()), persistentData(persistentData), rawPersistentQueue(persistentQueue), persistentQueue(new TLogQueue(persistentQueue, dbgid)), - dbInfo(dbInfo), queueCommitBegin(0), queueCommitEnd(0), + dbInfo(dbInfo), degraded(degraded), queueCommitBegin(0), queueCommitEnd(0), diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), overheadBytesInput(0), overheadBytesDurable(0), concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS) { @@ -1087,6 +1089,17 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere return Void(); } +ACTOR Future watchDegraded(TLogData* self) { + //This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask + state int loopCount = 0; + while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) { + wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); + loopCount++; + } + self->degraded->set(true); + return Void(); +} + ACTOR Future doQueueCommit( TLogData* self, Reference logData ) { state Version ver = logData->version.get(); state Version commitNumber = self->queueCommitBegin+1; @@ -1098,7 +1111,9 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData ) { self->diskQueueCommitBytes = 0; self->largeDiskQueueCommitBytes.set(false); + state Future degraded = watchDegraded(self); wait(c); + degraded.cancel(); wait(self->queueCommitEnd.whenAtLeast(commitNumber-1)); //Calling check_yield instead of yield to avoid a destruction ordering problem in simulation @@ -2052,8 +2067,8 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit } // New tLog (if !recoverFrom.size()) or restore from network -ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, Promise oldLog, Promise recovered ) { - state TLogData self( tlogId, persistentData, persistentQueue, db ); +ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, Promise oldLog, Promise recovered, Reference> degraded) { + state TLogData self( tlogId, persistentData, persistentQueue, db, degraded ); state Future error = actorCollection( self.sharedActors.getFuture() ); TraceEvent("SharedTlog", tlogId); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 064f9626a0..f8e428fd8c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -299,10 +299,12 @@ struct 
TLogData : NonCopyable { FlowLock concurrentLogRouterReads; FlowLock persistentDataCommitLock; - TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference> const& dbInfo) + Reference> degraded; + + TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference> dbInfo, Reference> degraded) : dbgid(dbgid), instanceID(g_random->randomUniqueID().first()), persistentData(persistentData), rawPersistentQueue(persistentQueue), persistentQueue(new TLogQueue(persistentQueue, dbgid)), - dbInfo(dbInfo), queueCommitBegin(0), queueCommitEnd(0), + dbInfo(dbInfo), degraded(degraded), queueCommitBegin(0), queueCommitEnd(0), diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), overheadBytesInput(0), overheadBytesDurable(0), peekMemoryLimiter(SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES), concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS) @@ -1363,6 +1365,17 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere return Void(); } +ACTOR Future watchDegraded(TLogData* self) { + //This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask + state int loopCount = 0; + while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) { + wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); + loopCount++; + } + self->degraded->set(true); + return Void(); +} + ACTOR Future doQueueCommit( TLogData* self, Reference logData ) { state Version ver = logData->version.get(); state Version commitNumber = self->queueCommitBegin+1; @@ -1374,7 +1387,9 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData ) { self->diskQueueCommitBytes = 0; self->largeDiskQueueCommitBytes.set(false); + state Future degraded = watchDegraded(self); wait(c); + degraded.cancel(); wait(self->queueCommitEnd.whenAtLeast(commitNumber-1)); //Calling check_yield instead of yield to avoid a destruction ordering problem in simulation @@ -2328,8 +2343,8 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit } // New tLog (if !recoverFrom.size()) or restore from network -ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, Promise oldLog, Promise recovered ) { - state TLogData self( tlogId, persistentData, persistentQueue, db ); +ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, Promise oldLog, Promise recovered, Reference> degraded ) { + state TLogData self( tlogId, persistentData, persistentQueue, db, degraded ); state Future error = actorCollection( self.sharedActors.getFuture() ); TraceEvent("SharedTlog", tlogId); diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index e339706415..93eeb851b1 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -366,7 +366,7 @@ ACTOR Future masterProxyServer(MasterProxyInterface proxy, InitializeMaste ACTOR Future tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, - Promise oldLog, Promise recovered); // changes tli->id() to be the recovered ID + Promise oldLog, Promise recovered, Reference> degraded); // 
changes tli->id() to be the recovered ID ACTOR Future monitorServerDBInfo(Reference>> ccInterface, Reference ccf, LocalityData locality, Reference> dbInfo); @@ -387,7 +387,7 @@ namespace oldTLog_6_0 { ACTOR Future tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, - Promise oldLog, Promise recovered); + Promise oldLog, Promise recovered, Reference> degraded); } typedef decltype(&tLog) TLogFn; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index c4b0dd0def..d61d67a6a8 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -349,22 +349,24 @@ ACTOR Future registrationClient( WorkerInterface interf, Reference> asyncPriorityInfo, ProcessClass initialClass, - Reference>> ddInterf) { + Reference>> ddInterf, + Reference> degraded) { // Keeps the cluster controller (as it may be re-elected) informed that this worker exists // The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply (requiring us to re-register) // The registration request piggybacks optional distributor interface if it exists. state Generation requestGeneration = 0; state ProcessClass processClass = initialClass; loop { - RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get()); + RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), degraded->get()); Future registrationReply = ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().registerWorker.getReply(request) ) : Never(); choose { when ( RegisterWorkerReply reply = wait( registrationReply )) { processClass = reply.processClass; asyncPriorityInfo->set( reply.priorityInfo ); } - when ( wait( ccInterface->onChange() )) { } + when ( wait( ccInterface->onChange() )) {} when ( wait( ddInterf->onChange() ) ) {} + when ( wait( degraded->onChange() ) ) {} } } } @@ -619,6 +621,7 @@ ACTOR Future workerServer( Reference connFile, Refe state WorkerCache storageCache; state Reference> dbInfo( new AsyncVar(ServerDBInfo()) ); state Future metricsLogger; + state Reference> degraded( new AsyncVar(false) ); // tLogFnForOptions() can return a function that doesn't correspond with the FDB version that the // TLogVersion represents. This can be done if the newer TLog doesn't support a requested option. // As (store type, spill type) can map to the same TLogFn across multiple TLogVersions, we need to @@ -735,7 +738,7 @@ ACTOR Future workerServer( Reference connFile, Refe auto& logData = sharedLogs[std::make_tuple(s.tLogOptions.version, s.storeType, s.tLogOptions.spillType)]; // FIXME: Shouldn't if logData.first isValid && !isReady, shouldn't we // be sending a fake InitializeTLogRequest rather than calling tLog() ? - Future tl = tLogFn( kv, queue, dbInfo, locality, !logData.first.isValid() || logData.first.isReady() ? logData.second : PromiseStream(), s.storeID, true, oldLog, recovery ); + Future tl = tLogFn( kv, queue, dbInfo, locality, !logData.first.isValid() || logData.first.isReady() ? 
logData.second : PromiseStream(), s.storeID, true, oldLog, recovery, degraded );
 recoveries.push_back(recovery.getFuture());
 tl = handleIOErrors( tl, kv, s.storeID );
@@ -756,7 +759,7 @@ ACTOR Future workerServer( Reference connFile, Refe
 wait(waitForAll(recoveries));
 recoveredDiskFiles.send(Void());
- errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf ) );
+ errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, degraded ) );
 TraceEvent("RecoveriesComplete", interf.id());
@@ -868,7 +871,7 @@ ACTOR Future workerServer( Reference connFile, Refe
 filesClosed.add( data->onClosed() );
 filesClosed.add( queue->onClosed() );
- logData.first = tLogFn( data, queue, dbInfo, locality, logData.second, logId, false, Promise(), Promise() );
+ logData.first = tLogFn( data, queue, dbInfo, locality, logData.second, logId, false, Promise(), Promise(), degraded );
 logData.first = handleIOErrors( logData.first, data, logId );
 logData.first = handleIOErrors( logData.first, queue, logId );
 errorForwarders.add( forwardError( errors, Role::SHARED_TRANSACTION_LOG, logId, logData.first ) );

From 45fe6b369b055c5d70257f90f6f067f63def79cd Mon Sep 17 00:00:00 2001
From: Evan Tschannen
Date: Fri, 8 Mar 2019 14:40:00 -0500
Subject: [PATCH 15/46] tlog recruitment will prefer non-degraded processes;
 however, it will not choose fewer than the desired number of tlogs just to
 avoid degraded processes. Better master exists will switch the master to
 avoid degraded processes.

---
 fdbserver/ClusterController.actor.cpp | 129 ++++++++++++++------------
 1 file changed, 71 insertions(+), 58 deletions(-)

diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp
index 938a5fc6b3..e1ba525e2d 100644
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@@ -264,7 +264,7 @@ public:
 }
 std::vector getWorkersForTlogs( DatabaseConfiguration const& conf, int32_t required, int32_t desired, IRepPolicyRef const& policy, std::map< Optional>, int>& id_used, bool checkStable = false, std::set> dcIds = std::set>() ) {
- std::map> fitness_workers;
+ std::map, vector> fitness_workers;
 std::vector results;
 std::vector unavailableLocals;
 LocalitySetRef logServerSet;
@@ -277,7 +277,7 @@ public:
 for( auto& it : id_worker ) {
 auto fitness = it.second.details.processClass.machineClassFitness( ProcessClass::TLog );
 if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId())) ) {
- fitness_workers[ fitness ].push_back(it.second.details);
+ fitness_workers[ std::make_pair(fitness,it.second.details.degraded) ].push_back(it.second.details);
 }
 else {
 unavailableLocals.push_back(it.second.details.interf.locality);
 }
 }
 results.reserve(results.size() + id_worker.size());
- for (int fitness = ProcessClass::BestFit; fitness != ProcessClass::NeverAssign; fitness ++)
+ for (int fitness = ProcessClass::BestFit; fitness != ProcessClass::NeverAssign && !bCompleted; fitness++)
 {
 auto fitnessEnum = (ProcessClass::Fitness) fitness;
- if (fitness_workers.find(fitnessEnum) == fitness_workers.end())
- continue;
- for (auto& worker : fitness_workers[(ProcessClass::Fitness) fitness] ) {
- logServerMap->add(worker.interf.locality, &worker);
- }
- if (logServerSet->size() < required) {
- TraceEvent(SevWarn,"GWFTADTooFew", id).detail("Fitness", 
fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy", policy->info()).detail("DesiredLogs", desired); - } - else if (logServerSet->size() == required || logServerSet->size() <= desired) { - if (logServerSet->validate(policy)) { - for (auto& object : logServerMap->getObjects()) { - results.push_back(*object); - } - bCompleted = true; - break; + for(int addingDegraded = 0; addingDegraded < 2; addingDegraded++) { + auto workerItr = fitness_workers.find(std::make_pair(fitnessEnum,(bool)addingDegraded)); + if (workerItr == fitness_workers.end()) { + continue; } - TraceEvent(SevWarn,"GWFTADNotAcceptable", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy",policy->info()).detail("DesiredLogs", desired); - } - // Try to select the desired size, if larger - else { - std::vector bestSet; - std::vector tLocalities; + for (auto& worker : workerItr->second ) { + logServerMap->add(worker.interf.locality, &worker); + } + if (logServerSet->size() < required) { + TraceEvent(SevWarn,"GWFTADTooFew", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy", policy->info()).detail("DesiredLogs", desired).detail("AddingDegraded", addingDegraded); + } + else if (logServerSet->size() == required || logServerSet->size() <= desired) { + if (logServerSet->validate(policy)) { + for (auto& object : logServerMap->getObjects()) { + results.push_back(*object); + } + bCompleted = true; + break; + } + TraceEvent(SevWarn,"GWFTADNotAcceptable", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy",policy->info()).detail("DesiredLogs", desired).detail("AddingDegraded", addingDegraded); + } + // Try to select the desired size, if larger + else { + std::vector bestSet; + std::vector tLocalities; - // Try to find the best team of servers to fulfill the policy - if (findBestPolicySet(bestSet, logServerSet, policy, desired, SERVER_KNOBS->POLICY_RATING_TESTS, SERVER_KNOBS->POLICY_GENERATIONS)) { - results.reserve(results.size() + bestSet.size()); - for (auto& entry : bestSet) { - auto object = logServerMap->getObject(entry); - ASSERT(object); - results.push_back(*object); - tLocalities.push_back(object->interf.locality); + // Try to find the best team of servers to fulfill the policy + if (findBestPolicySet(bestSet, logServerSet, policy, desired, SERVER_KNOBS->POLICY_RATING_TESTS, SERVER_KNOBS->POLICY_GENERATIONS)) { + results.reserve(results.size() + bestSet.size()); + for (auto& entry : bestSet) { + auto object = logServerMap->getObject(entry); + ASSERT(object); + results.push_back(*object); + tLocalities.push_back(object->interf.locality); + } + TraceEvent("GWFTADBestResults", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("BestCount", bestSet.size()).detail("BestZones", ::describeZones(tLocalities)) + .detail("BestDataHalls", ::describeDataHalls(tLocalities)).detail("TLogPolicy", policy->info()).detail("TotalResults", results.size()).detail("DesiredLogs", desired).detail("AddingDegraded", addingDegraded); + bCompleted = true; + break; } - TraceEvent("GWFTADBestResults", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("BestCount", bestSet.size()).detail("BestZones", ::describeZones(tLocalities)) - .detail("BestDataHalls", ::describeDataHalls(tLocalities)).detail("TLogPolicy", policy->info()).detail("TotalResults", 
results.size()).detail("DesiredLogs", desired); - bCompleted = true; - break; + TraceEvent(SevWarn,"GWFTADNoBest", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy", policy->info()).detail("DesiredLogs", desired).detail("AddingDegraded", addingDegraded); } - TraceEvent(SevWarn,"GWFTADNoBest", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy", policy->info()).detail("DesiredLogs", desired); } } @@ -454,39 +458,44 @@ public: ProcessClass::Fitness worstFit; ProcessClass::ClusterRole role; int count; + bool worstIsDegraded; - RoleFitness(int bestFit, int worstFit, int count, ProcessClass::ClusterRole role) : bestFit((ProcessClass::Fitness)bestFit), worstFit((ProcessClass::Fitness)worstFit), count(count), role(role) {} + RoleFitness(int bestFit, int worstFit, int count, ProcessClass::ClusterRole role) : bestFit((ProcessClass::Fitness)bestFit), worstFit((ProcessClass::Fitness)worstFit), count(count), role(role), worstIsDegraded(false) {} - RoleFitness(int fitness, int count, ProcessClass::ClusterRole role) : bestFit((ProcessClass::Fitness)fitness), worstFit((ProcessClass::Fitness)fitness), count(count), role(role) {} + RoleFitness(int fitness, int count, ProcessClass::ClusterRole role) : bestFit((ProcessClass::Fitness)fitness), worstFit((ProcessClass::Fitness)fitness), count(count), role(role), worstIsDegraded(false) {} - RoleFitness() : bestFit(ProcessClass::NeverAssign), worstFit(ProcessClass::NeverAssign), role(ProcessClass::NoRole), count(0) {} + RoleFitness() : bestFit(ProcessClass::NeverAssign), worstFit(ProcessClass::NeverAssign), role(ProcessClass::NoRole), count(0), worstIsDegraded(false) {} - RoleFitness(RoleFitness first, RoleFitness second, ProcessClass::ClusterRole role) : bestFit(std::min(first.worstFit, second.worstFit)), worstFit(std::max(first.worstFit, second.worstFit)), count(first.count + second.count), role(role) { } + RoleFitness(RoleFitness first, RoleFitness second, ProcessClass::ClusterRole role) : bestFit(std::min(first.worstFit, second.worstFit)), worstFit(std::max(first.worstFit, second.worstFit)), count(first.count + second.count), role(role) { + if(first.worstFit > second.worstFit) { + worstIsDegraded = first.worstIsDegraded; + } else if(second.worstFit > first.worstFit) { + worstIsDegraded = second.worstIsDegraded; + } else { + worstIsDegraded = first.worstIsDegraded || second.worstIsDegraded; + } + } RoleFitness( vector workers, ProcessClass::ClusterRole role ) : role(role) { worstFit = ProcessClass::BestFit; + worstIsDegraded = false; bestFit = ProcessClass::NeverAssign; - for(auto it : workers) { + for(auto& it : workers) { auto thisFit = it.processClass.machineClassFitness( role ); - worstFit = std::max(worstFit, thisFit); + if(thisFit > worstFit) { + worstFit = thisFit; + worstIsDegraded = it.degraded; + } else if(thisFit == worstFit) { + worstIsDegraded = worstIsDegraded || it.degraded; + } bestFit = std::min(bestFit, thisFit); } count = workers.size(); } - RoleFitness( std::vector classes, ProcessClass::ClusterRole role ) : role(role) { - worstFit = ProcessClass::BestFit; - bestFit = ProcessClass::NeverAssign; - for(auto it : classes) { - auto thisFit = it.machineClassFitness( role ); - worstFit = std::max(worstFit, thisFit); - bestFit = std::min(bestFit, thisFit); - } - count = classes.size(); - } - bool operator < (RoleFitness const& r) const { if (worstFit != r.worstFit) return worstFit < r.worstFit; + if 
(worstIsDegraded != r.worstIsDegraded) return r.worstIsDegraded; // FIXME: TLog recruitment process does not guarantee the best fit is not worsened. if (role != ProcessClass::TLog && role != ProcessClass::LogRouter && bestFit != r.bestFit) return bestFit < r.bestFit; return count > r.count; @@ -494,18 +503,21 @@ public: bool betterFitness (RoleFitness const& r) const { if (worstFit != r.worstFit) return worstFit < r.worstFit; + if (worstIsDegraded != r.worstIsDegraded) return r.worstFit; if (bestFit != r.bestFit) return bestFit < r.bestFit; return false; } bool betterCount (RoleFitness const& r) const { if(count > r.count) return true; - return worstFit < r.worstFit; + if(worstFit != r.worstFit) return worstFit < r.worstFit; + if (worstIsDegraded != r.worstIsDegraded) return r.worstFit; + return false; } - bool operator == (RoleFitness const& r) const { return worstFit == r.worstFit && bestFit == r.bestFit && count == r.count; } + bool operator == (RoleFitness const& r) const { return worstFit == r.worstFit && bestFit == r.bestFit && count == r.count && worstIsDegraded == r.worstIsDegraded; } - std::string toString() const { return format("%d %d %d", bestFit, worstFit, count); } + std::string toString() const { return format("%d %d %d %d", bestFit, worstFit, count, worstIsDegraded); } }; std::set>> getDatacenters( DatabaseConfiguration const& conf, bool checkStable = false ) { @@ -1753,7 +1765,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { self->db.setDistributor( di ); } if( info == self->id_worker.end() ) { - self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, false ); + self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, req.degraded ); checkOutstandingRequests( self ); return; } @@ -1766,6 +1778,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { info->second.details.processClass = newProcessClass; info->second.priorityInfo = newPriorityInfo; info->second.initialClass = req.initialClass; + info->second.details.degraded = req.degraded; info->second.gen = req.generation; if(info->second.details.interf.id() != w.id()) { From 41c493f8d493c80d459208199a3e1255b5b016b4 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 8 Mar 2019 14:40:32 -0500 Subject: [PATCH 16/46] fix: connectPacket accessed uninitialized variables --- fdbrpc/FlowTransport.actor.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index c1490ed9af..e38a5bfd2f 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -215,6 +215,10 @@ struct ConnectPacket { uint16_t flags; uint8_t canonicalRemoteIp6[16]; + ConnectPacket() { + memset(this, 0, sizeof(*this)); + } + IPAddress canonicalRemoteIp() const { if (isIPv6()) { IPAddress::IPAddressStore store; From 1be9ae5ce3d4482bafcc9b2a822a301106ec15b1 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 8 Mar 2019 22:51:06 -0500 Subject: [PATCH 17/46] fixed merge conflict --- fdbserver/ClusterController.actor.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 65a78b29cd..da3f15a602 100644 --- a/fdbserver/ClusterController.actor.cpp 
+++ b/fdbserver/ClusterController.actor.cpp @@ -59,15 +59,9 @@ struct WorkerInfo : NonCopyable { WorkerInfo( Future watcher, ReplyPromise reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded ) : watcher(watcher), reply(reply), gen(gen), reboots(0), lastAvailableTime(now()), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded) {} -<<<<<<< HEAD - WorkerInfo( WorkerInfo&& r ) noexcept(true) : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen), - reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)) {} - void operator=( WorkerInfo&& r ) noexcept(true) { -======= WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen), - reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), interf(std::move(r.interf)), initialClass(r.initialClass), processClass(r.processClass), priorityInfo(r.priorityInfo) {} + reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)) {} void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT { ->>>>>>> master watcher = std::move(r.watcher); reply = std::move(r.reply); gen = r.gen; From c6e94293bfeccc9015a57b438276838da27ec850 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sun, 10 Mar 2019 22:39:21 -0700 Subject: [PATCH 18/46] reset a process to not be degraded after 2 days --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/worker.actor.cpp | 1 + flow/genericactors.actor.h | 22 ++++++++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 1d94f56110..72f2b591b3 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -76,6 +76,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( DISK_QUEUE_FILE_SHRINK_BYTES, 100<<20 ); // BUGGIFYd per file within the DiskQueue init( TLOG_DEGRADED_DELAY_COUNT, 5 ); init( TLOG_DEGRADED_DURATION, 5.0 ); + init( TLOG_DEGRADED_RESET_INTERVAL, 48*60*60 ); if ( randomize && BUGGIFY ) TLOG_DEGRADED_RESET_INTERVAL = 10; // Data distribution queue init( HEALTH_POLL_TIME, 1.0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index f3184b3345..7d55141a7d 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -79,6 +79,7 @@ public: int64_t DISK_QUEUE_FILE_SHRINK_BYTES; // When we shrink the disk queue, by how many bytes should it shrink? 
int TLOG_DEGRADED_DELAY_COUNT; double TLOG_DEGRADED_DURATION; + double TLOG_DEGRADED_RESET_INTERVAL; // Data distribution queue double HEALTH_POLL_TIME; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index d61d67a6a8..9530a90103 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -646,6 +646,7 @@ ACTOR Future workerServer( Reference connFile, Refe } } + errorForwarders.add( resetAfter(degraded, SERVER_KNOBS->TLOG_DEGRADED_RESET_INTERVAL, false)); errorForwarders.add( loadedPonger( interf.debugPing.getFuture() ) ); errorForwarders.add( waitFailureServer( interf.waitFailure.getFuture() ) ); errorForwarders.add( monitorServerDBInfo( ccInterface, connFile, locality, dbInfo ) ); diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index b591137c4a..f1a5bb8d53 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -775,6 +775,28 @@ Future setAfter( Reference> var, double time, T val ) { return Void(); } +ACTOR template +Future resetAfter( Reference> var, double time, T val ) { + state bool isEqual = var->get() == val; + state Future resetDelay = isEqual ? Never() : delay(time); + loop { + choose { + when( wait( resetDelay ) { + var->set( val ); + } + when( wait( var->onChange() ) ) {} + } + if( isEqual && var->get() != val ) { + isEqual = false; + resetDelay = delay(time); + } + if( !isEqual && var->get() == val ) { + isEqual = true; + resetDelay = Never(); + } + } +} + ACTOR template Future setWhenDoneOrError( Future condition, Reference> var, T val ) { try { From 2ff37f49dab8f01233a4d7cfaac38e5fdc91e588 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sun, 10 Mar 2019 22:56:12 -0700 Subject: [PATCH 19/46] fix: compiler error --- flow/genericactors.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index f1a5bb8d53..8570553341 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -781,7 +781,7 @@ Future resetAfter( Reference> var, double time, T val ) { state Future resetDelay = isEqual ? 
Never() : delay(time); loop { choose { - when( wait( resetDelay ) { + when( wait( resetDelay ) ) { var->set( val ); } when( wait( var->onChange() ) ) {} From 80c3f2f8e29c4292364e630e504a4d3239979b44 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sun, 10 Mar 2019 22:58:15 -0700 Subject: [PATCH 20/46] added status fields detailing which processes are degraded, and also the total number of degraded processes --- fdbclient/Schemas.cpp | 2 ++ fdbserver/OldTLogServer_6_0.actor.cpp | 1 + fdbserver/Status.actor.cpp | 12 +++++++++++- fdbserver/TLogServer.actor.cpp | 1 + 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 769d7e8d25..89d91eb6d7 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -52,6 +52,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "test" ] }, + "degraded":true, "roles":[ { "query_queue_max":0, @@ -281,6 +282,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( ], "datacenter_version_difference":0, + "degraded_processes":0, "database_available":true, "database_locked":false, "generation":2, diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 2dc28e17c7..8386d4828d 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1096,6 +1096,7 @@ ACTOR Future watchDegraded(TLogData* self) { wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); loopCount++; } + TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); self->degraded->set(true); return Void(); } diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 50b94237ed..d64b980e37 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -796,7 +796,9 @@ ACTOR static Future processStatusFetcher( statusObj["class_type"] = workerItr->processClass.toString(); statusObj["class_source"] = workerItr->processClass.sourceString(); - + if(workerItr->degraded) { + statusObj["degraded"] = true; + } } catch (Error& e){ // Something strange occurred, process list is incomplete but what was built so far, if anything, will be returned. 
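The two Status.actor.cpp hunks in this patch work together: the hunk above attaches a per-process "degraded" flag, and the hunk below sums those flags into a cluster-wide total. As a hedged sketch, the resulting status JSON would look roughly like the following; the process ID and the surrounding structure are illustrative assumptions, and only the "degraded" and "degraded_processes" keys come from this patch and its schema change:

    "cluster" : {
        "degraded_processes" : 1,
        "processes" : {
            "6a5d9f..." : {
                "class_type" : "transaction",
                "class_source" : "command_line",
                "degraded" : true
            }
        }
    }
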
@@ -2033,6 +2035,14 @@ ACTOR Future clusterGetStatus( statusObj["incompatible_connections"] = incompatibleConnectionsArray; statusObj["datacenter_version_difference"] = datacenterVersionDifference; + int totalDegraded = 0; + for(auto& it : workers) { + if(it.degraded) { + totalDegraded++; + } + } + statusObj["degraded_processes"] = totalDegraded; + if (!recoveryStateStatus.empty()) statusObj["recovery_state"] = recoveryStateStatus; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index dd70693202..f197a2aacc 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1372,6 +1372,7 @@ ACTOR Future watchDegraded(TLogData* self) { wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); loopCount++; } + TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); self->degraded->set(true); return Void(); } From 5873705228fbb57b4ddaca709a2f13d23884cb54 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 11 Mar 2019 12:11:17 -0700 Subject: [PATCH 21/46] tlog commits very rarely take an additional 6 seconds --- fdbserver/OldTLogServer_6_0.actor.cpp | 5 +++++ fdbserver/TLogServer.actor.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 8386d4828d..17af19929b 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -33,6 +33,7 @@ #include "fdbrpc/FailureMonitor.h" #include "fdbserver/IDiskQueue.h" #include "fdbrpc/sim_validation.h" +#include "fdbrpc/simulator.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/LogSystem.h" #include "fdbserver/WaitFailure.h" @@ -1097,6 +1098,7 @@ ACTOR Future watchDegraded(TLogData* self) { loopCount++; } TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); + TEST(true); //6.0 TLog degraded self->degraded->set(true); return Void(); } @@ -1114,6 +1116,9 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData ) { state Future degraded = watchDegraded(self); wait(c); + if(g_network->isSimulated() && !g_simulator.speedUpSimulation && BUGGIFY_WITH_PROB(0.0001)) { + wait(delay(6.0)); + } degraded.cancel(); wait(self->queueCommitEnd.whenAtLeast(commitNumber-1)); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index f197a2aacc..dc1deec8ce 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -33,6 +33,7 @@ #include "fdbrpc/FailureMonitor.h" #include "fdbserver/IDiskQueue.h" #include "fdbrpc/sim_validation.h" +#include "fdbrpc/simulator.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/LogSystem.h" #include "fdbserver/WaitFailure.h" @@ -1373,6 +1374,7 @@ ACTOR Future watchDegraded(TLogData* self) { loopCount++; } TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); + TEST(true); //TLog degraded self->degraded->set(true); return Void(); } @@ -1390,6 +1392,9 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData ) { state Future degraded = watchDegraded(self); wait(c); + if(g_network->isSimulated() && !g_simulator.speedUpSimulation && BUGGIFY_WITH_PROB(0.0001)) { + wait(delay(6.0)); + } degraded.cancel(); wait(self->queueCommitEnd.whenAtLeast(commitNumber-1)); From 8504bd6c9f71f5afd80e4e5b6a9c67e2969fd05e Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Mon, 11 Mar 2019 15:08:33 -0700 Subject: [PATCH 22/46] Update release notes. 
---
 documentation/sphinx/source/release-notes.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst
index d0f1b945c5..1ffbfefe85 100644
--- a/documentation/sphinx/source/release-notes.rst
+++ b/documentation/sphinx/source/release-notes.rst
@@ -16,6 +16,8 @@ Features
 * Batch priority transactions are now limited separately by ratekeeper and will be throttled at lower levels of cluster saturation. This makes it possible to run a more intense background load at saturation without significantly affecting normal priority transactions. It is still recommended not to run excessive loads at batch priority. `(PR #1198) `_
 * Restore now requires the destination cluster to be specified explicitly to avoid confusion. `(PR #1240) `_
 * Restore target version can now be specified by timestamp if the original cluster is available. `(PR #1240) `_
+* Separate data distribution out from master as a new role. `(PR #1062) `_
+* Separate ratekeeper out from data distribution as a new role. `(PR #1176) `_

 Performance
 -----------
From 78ff3d92c1450322cc56627743f657158fcd393c Mon Sep 17 00:00:00 2001
From: Alec Grieser
Date: Sun, 17 Feb 2019 10:09:42 -0800
Subject: [PATCH 23/46] memoize the packed Tuple representation

---
 bindings/java/CMakeLists.txt                  |   1 +
 .../tuple/IterableComparator.java             |   2 +-
 .../com/apple/foundationdb/tuple/Tuple.java   | 156 ++++--
 .../apple/foundationdb/tuple/TupleUtil.java   | 458 ++++++++++--------
 .../test/TuplePerformanceTest.java            |  12 +-
 5 files changed, 373 insertions(+), 256 deletions(-)

diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt
index 8a67e8f08a..93e7e7ea8e 100644
--- a/bindings/java/CMakeLists.txt
+++ b/bindings/java/CMakeLists.txt
@@ -89,6 +89,7 @@ set(JAVA_TESTS_SRCS
   src/test/com/apple/foundationdb/test/TesterArgs.java
   src/test/com/apple/foundationdb/test/TestResult.java
   src/test/com/apple/foundationdb/test/TupleTest.java
+  src/test/com/apple/foundationdb/test/TuplePerformanceTest.java
   src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java
   src/test/com/apple/foundationdb/test/WatchTest.java
   src/test/com/apple/foundationdb/test/WhileTrueTest.java)
diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/IterableComparator.java b/bindings/java/src/main/com/apple/foundationdb/tuple/IterableComparator.java
index 71aa23e9b1..1587b3fd6e 100644
--- a/bindings/java/src/main/com/apple/foundationdb/tuple/IterableComparator.java
+++ b/bindings/java/src/main/com/apple/foundationdb/tuple/IterableComparator.java
@@ -34,7 +34,7 @@ import java.util.Iterator;
 * tuple1.compareTo(tuple2)
 * == new IterableComparator().compare(tuple1, tuple2)
 * == new IterableComparator().compare(tuple1.getItems(), tuple2.getItems()),
- * == ByteArrayUtil.compareUnsigned(tuple1.pack(), tuple2.pack())}
+ * == ByteArrayUtil.compareUnsigned(tuple1.packInternal(), tuple2.packInternal())}
 *
 *

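The Tuple.java diff that follows memoizes the packed representation: the prefix-free encoding is computed once, cached in a new packed field, and then either copied or re-prefixed on later calls instead of being re-encoded. The following standalone sketch distills that caching pattern under stated assumptions: MemoizedPacker and encodeOnce are illustrative names, not FDB API, and the stand-in encoding is a placeholder for the real tuple encoder.

	import java.util.Arrays;

	// Minimal sketch of the memoization pattern: cache the prefix-free packed
	// form once, then serve prefixed or defensively-copied views from the cache.
	final class MemoizedPacker {
		private byte[] packed; // lazily computed, never handed out directly

		private byte[] encodeOnce() {
			if(packed == null) {
				packed = new byte[]{0x15, 0x01}; // placeholder for the real encoding
			}
			return packed;
		}

		byte[] pack() {
			byte[] cached = encodeOnce();
			// Copy so a caller mutating the result cannot corrupt the cache.
			return Arrays.copyOf(cached, cached.length);
		}

		byte[] pack(byte[] prefix) {
			byte[] cached = encodeOnce();
			byte[] result = Arrays.copyOf(prefix, prefix.length + cached.length);
			System.arraycopy(cached, 0, result, prefix.length, cached.length);
			return result;
		}
	}

The defensive copy is the price of sharing one cached array across calls; callers that can promise not to mutate the result can skip it, which is what the internal copy flag in the real diff is for.
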
diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java index 557432d4e3..7b14632452 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java @@ -68,10 +68,11 @@ import com.apple.foundationdb.Range; * This class is not thread safe. */ public class Tuple implements Comparable, Iterable { - private static IterableComparator comparator = new IterableComparator(); + private static final IterableComparator comparator = new IterableComparator(); private List elements; private int memoizedHash = 0; + private byte[] packed = null; private Tuple(List elements, Object newItem) { this(elements); @@ -82,6 +83,12 @@ public class Tuple implements Comparable, Iterable { this.elements = new ArrayList<>(elements); } + private enum VersionstampExpectations { + UNKNOWN, + HAS_INCOMPLETE, + HAS_NO_INCOMPLETE + } + /** * Creates a copy of this {@code Tuple} with an appended last element. The parameter * is untyped but only {@link String}, {@code byte[]}, {@link Number}s, {@link UUID}s, @@ -261,7 +268,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple addAll(List o) { - List merged = new ArrayList(o.size() + this.elements.size()); + List merged = new ArrayList<>(o.size() + this.elements.size()); merged.addAll(this.elements); merged.addAll(o); return new Tuple(merged); @@ -275,7 +282,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple addAll(Tuple other) { - List merged = new ArrayList(this.size() + other.size()); + List merged = new ArrayList<>(this.size() + other.size()); merged.addAll(this.elements); merged.addAll(other.peekItems()); return new Tuple(merged); @@ -285,10 +292,10 @@ public class Tuple implements Comparable, Iterable { * Get an encoded representation of this {@code Tuple}. Each element is encoded to * {@code byte}s and concatenated. * - * @return a serialized representation of this {@code Tuple}. + * @return a packed representation of this {@code Tuple}. */ public byte[] pack() { - return pack(null); + return packInternal(null, true); } /** @@ -296,11 +303,36 @@ public class Tuple implements Comparable, Iterable { * {@code byte}s and concatenated, and then the prefix supplied is prepended to * the array. * - * @param prefix additional byte-array prefix to prepend to serialized bytes. - * @return a serialized representation of this {@code Tuple} prepended by the {@code prefix}. + * @param prefix additional byte-array prefix to prepend to packed bytes. + * @return a packed representation of this {@code Tuple} prepended by the {@code prefix}. 
	 */
	public byte[] pack(byte[] prefix) {
-		return TupleUtil.pack(elements, prefix);
+		return packInternal(prefix, true);
+	}
+
+	byte[] packInternal(byte[] prefix, boolean copy) {
+		boolean hasPrefix = prefix != null && prefix.length > 0;
+		if(packed == null) {
+			byte[] result = TupleUtil.pack(elements, prefix);
+			if(hasPrefix) {
+				packed = Arrays.copyOfRange(result, prefix.length, result.length);
+				return result;
+			}
+			else {
+				packed = result;
+			}
+		}
+		if(hasPrefix) {
+			return ByteArrayUtil.join(prefix, packed);
+		}
+		else {
+			if(copy) {
+				return Arrays.copyOf(packed, packed.length);
+			}
+			else {
+				return packed;
+			}
+		}
	}

	/**
@@ -309,7 +341,7 @@
	 * This works the same as the {@link #packWithVersionstamp(byte[]) one-parameter version of this method},
	 * but it does not add any prefix to the array.
	 *
-	 * @return a serialized representation of this {@code Tuple} for use with versionstamp ops.
+	 * @return a packed representation of this {@code Tuple} for use with versionstamp ops.
	 * @throws IllegalArgumentException if there is not exactly one incomplete {@link Versionstamp} included in this {@code Tuple}
	 */
	public byte[] packWithVersionstamp() {
@@ -322,28 +354,71 @@
	 * There must be exactly one incomplete {@link Versionstamp} instance within this
	 * {@code Tuple} or this will throw an {@link IllegalArgumentException}.
	 * Each element is encoded to {@code byte}s and concatenated, the prefix
-	 * is then prepended to the array, and then the index of the serialized incomplete
+	 * is then prepended to the array, and then the index of the packed incomplete
	 * {@link Versionstamp} is appended as a little-endian integer. This can then be passed
	 * as the key to
	 * {@link com.apple.foundationdb.Transaction#mutate(com.apple.foundationdb.MutationType, byte[], byte[]) Transaction.mutate()}
	 * with the {@code SET_VERSIONSTAMPED_KEY} {@link com.apple.foundationdb.MutationType}, and the transaction's
	 * version will then be filled in at commit time.
	 *
-	 * @param prefix additional byte-array prefix to prepend to serialized bytes.
-	 * @return a serialized representation of this {@code Tuple} for use with versionstamp ops.
+	 * @param prefix additional byte-array prefix to prepend to packed bytes.
+	 * @return a packed representation of this {@code Tuple} for use with versionstamp ops.
* @throws IllegalArgumentException if there is not exactly one incomplete {@link Versionstamp} included in this {@code Tuple} */ public byte[] packWithVersionstamp(byte[] prefix) { return TupleUtil.packWithVersionstamp(elements, prefix); } + byte[] packWithVersionstampInternal(byte[] prefix, boolean copy) { + boolean hasPrefix = prefix != null && prefix.length > 0; + if(packed == null) { + byte[] result = TupleUtil.packWithVersionstamp(elements, prefix); + if(hasPrefix) { + byte[] withoutPrefix = Arrays.copyOfRange(result, prefix.length, result.length); + TupleUtil.adjustVersionPosition(packed, -1 * prefix.length); + packed = withoutPrefix; + return result; + } + else { + packed = result; + } + } + if(hasPrefix) { + byte[] withPrefix = ByteArrayUtil.join(prefix, packed); + TupleUtil.adjustVersionPosition(withPrefix, prefix.length); + return withPrefix; + } + else { + if(copy) { + return Arrays.copyOf(packed, packed.length); + } + else { + return packed; + } + } + } + + byte[] packMaybeVersionstamp(byte[] prefix) { + if(packed == null) { + if(hasIncompleteVersionstamp()) { + return packWithVersionstampInternal(prefix, false); + } + else { + return packInternal(prefix, false); + } + } + else { + return packed; + } + } + /** * Gets the unserialized contents of this {@code Tuple}. * * @return the elements that make up this {@code Tuple}. */ public List getItems() { - return new ArrayList(elements); + return new ArrayList<>(elements); } /** @@ -385,7 +460,7 @@ public class Tuple implements Comparable, Iterable { * @see #fromItems(Iterable) */ public Tuple() { - this.elements = new LinkedList(); + this.elements = new LinkedList<>(); } /** @@ -413,6 +488,7 @@ public class Tuple implements Comparable, Iterable { public static Tuple fromBytes(byte[] bytes, int offset, int length) { Tuple t = new Tuple(); t.elements = TupleUtil.unpack(bytes, offset, length); + t.packed = Arrays.copyOfRange(bytes, offset, offset + length); return t; } @@ -623,13 +699,14 @@ public class Tuple implements Comparable, Iterable { Object o = this.elements.get(index); if(o == null) { return null; - } else if(o instanceof Tuple) { + } + else if(o instanceof Tuple) { return ((Tuple)o).getItems(); - } else if(o instanceof List) { - List ret = new LinkedList(); - ret.addAll((List)o); - return ret; - } else { + } + else if(o instanceof List) { + return new ArrayList<>((List) o); + } + else { throw new ClassCastException("Cannot convert item of type " + o.getClass() + " to list"); } } @@ -678,11 +755,10 @@ public class Tuple implements Comparable, Iterable { * @throws IllegalStateException if this {@code Tuple} is empty */ public Tuple popFront() { - if(elements.size() == 0) + if(elements.isEmpty()) throw new IllegalStateException("Tuple contains no elements"); - - List items = new ArrayList(elements.size() - 1); + List items = new ArrayList<>(elements.size() - 1); for(int i = 1; i < this.elements.size(); i++) { items.add(this.elements.get(i)); } @@ -697,11 +773,10 @@ public class Tuple implements Comparable, Iterable { * @throws IllegalStateException if this {@code Tuple} is empty */ public Tuple popBack() { - if(elements.size() == 0) + if(elements.isEmpty()) throw new IllegalStateException("Tuple contains no elements"); - - List items = new ArrayList(elements.size() - 1); + List items = new ArrayList<>(elements.size() - 1); for(int i = 0; i < this.elements.size() - 1; i++) { items.add(this.elements.get(i)); } @@ -718,12 +793,18 @@ public class Tuple implements Comparable, Iterable { * Tuple t = Tuple.from("a", "b"); * Range r = 
t.range(); * {@code r} includes all tuples ("a", "b", ...) + *
+ * This function will throw an error if this {@code Tuple} contains an incomplete + * {@link Versionstamp}. * * @return the range of keys containing all {@code Tuple}s that have this {@code Tuple} * as a prefix */ public Range range() { - byte[] p = pack(); + if(hasIncompleteVersionstamp()) { + throw new IllegalStateException("Tuple with incomplete versionstamp used for range"); + } + byte[] p = packInternal(null, false); //System.out.println("Packed tuple is: " + ByteArrayUtil.printable(p)); return new Range(ByteArrayUtil.join(p, new byte[] {0x0}), ByteArrayUtil.join(p, new byte[] {(byte)0xff})); @@ -742,6 +823,16 @@ public class Tuple implements Comparable, Iterable { return TupleUtil.hasIncompleteVersionstamp(stream()); } + /** + * Get the number of bytes in the packed representation of this {@code Tuple}. + * + * @return + */ + public int getPackedSize() { + byte[] p = packMaybeVersionstamp(null); + return p.length; + } + /** * Compare the byte-array representation of this {@code Tuple} against another. This method * will sort {@code Tuple}s in the same order that they would be sorted as keys in @@ -772,14 +863,7 @@ public class Tuple implements Comparable, Iterable { @Override public int hashCode() { if(memoizedHash == 0) { - byte[] packed; - if(hasIncompleteVersionstamp()) { - packed = packWithVersionstamp(null); - } - else { - packed = pack(); - } - memoizedHash = Arrays.hashCode(packed); + memoizedHash = Arrays.hashCode(packMaybeVersionstamp(null)); } return memoizedHash; } @@ -1011,7 +1095,7 @@ public class Tuple implements Comparable, Iterable { } private static Tuple createTuple(int items) { - List elements = new ArrayList(items); + List elements = new ArrayList<>(items); for(int i = 0; i < items; i++) { elements.add(new byte[]{99}); } diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java index cf1d337f2e..f25828f47d 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java @@ -28,7 +28,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; -import java.util.LinkedList; import java.util.List; import java.util.UUID; import java.util.stream.Stream; @@ -73,22 +72,45 @@ class TupleUtil { } static class DecodeResult { - final int end; - final Object o; + final List values; + int end; - DecodeResult(int pos, Object o) { - this.end = pos; - this.o = o; + DecodeResult() { + values = new ArrayList<>(); + end = 0; + } + + void add(Object value, int end) { + values.add(value); + this.end = end; } } static class EncodeResult { - final int totalLength; - final int versionPos; + final List encodedValues; + int totalLength; + int versionPos; - EncodeResult(int totalLength, int versionPos) { - this.totalLength = totalLength; + EncodeResult(int capacity) { + this.encodedValues = new ArrayList<>(capacity); + totalLength = 0; + versionPos = -1; + } + + EncodeResult add(byte[] encoded, int versionPos) { + if(versionPos >= 0 && this.versionPos >= 0) { + throw new IllegalArgumentException("Multiple incomplete Versionstamps included in Tuple"); + } + encodedValues.add(encoded); + totalLength += encoded.length; this.versionPos = versionPos; + return this; + } + + EncodeResult add(byte[] encoded) { + encodedValues.add(encoded); + totalLength += encoded.length; + return this; } } @@ -129,10 +151,44 @@ class TupleUtil { return bytes; } - public static 
byte[] join(List items) { + static byte[] join(List items) { return ByteArrayUtil.join(null, items); } + private static void adjustVersionPosition300(byte[] packed, int delta) { + int offsetOffset = packed.length - Short.BYTES; + ByteBuffer buffer = ByteBuffer.wrap(packed, offsetOffset, Short.BYTES).order(ByteOrder.LITTLE_ENDIAN); + int versionPosition = buffer.getShort() + delta; + if(versionPosition > 0xffff) { + throw new IllegalArgumentException("Tuple has incomplete version at position " + versionPosition + " which is greater than the maximum " + 0xffff); + } + if(versionPosition < 0) { + throw new IllegalArgumentException("Tuple has an incomplete version at a negative position"); + } + buffer.position(offsetOffset); + buffer.putShort((short)versionPosition); + } + + private static void adjustVersionPosition520(byte[] packed, int delta) { + int offsetOffset = packed.length - Integer.BYTES; + ByteBuffer buffer = ByteBuffer.wrap(packed, offsetOffset, Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN); + int versionPosition = buffer.getInt() + delta; + if(versionPosition < 0) { + throw new IllegalArgumentException("Tuple has an incomplete version at a negative position"); + } + buffer.position(offsetOffset); + buffer.putInt(versionPosition); + } + + static void adjustVersionPosition(byte[] packed, int delta) { + if(FDB.instance().getAPIVersion() < 520) { + adjustVersionPosition300(packed, delta); + } + else { + adjustVersionPosition520(packed, delta); + } + } + static int getCodeFor(Object o) { if(o == null) return nil; @@ -159,71 +215,60 @@ class TupleUtil { throw new IllegalArgumentException("Unsupported data type: " + o.getClass().getName()); } - static EncodeResult encode(Object t, boolean nested, List encoded) { + static void encode(EncodeResult result, Object t, boolean nested) { if(t == null) { if(nested) { - encoded.add(NULL_ESCAPED_ARR); - return new EncodeResult(NULL_ESCAPED_ARR.length, -1); + result.add(NULL_ESCAPED_ARR); } else { - encoded.add(NULL_ARR); - return new EncodeResult(NULL_ARR.length, -1); + result.add(NULL_ARR); } } - if(t instanceof byte[]) - return encode((byte[]) t, encoded); - if(t instanceof String) - return encode((String)t, encoded); - if(t instanceof BigInteger) - return encode((BigInteger)t, encoded); - if(t instanceof Float) - return encode((Float)t, encoded); - if(t instanceof Double) - return encode((Double)t, encoded); - if(t instanceof Boolean) - return encode((Boolean)t, encoded); - if(t instanceof UUID) - return encode((UUID)t, encoded); - if(t instanceof Number) - return encode(((Number)t).longValue(), encoded); - if(t instanceof Versionstamp) - return encode((Versionstamp)t, encoded); - if(t instanceof List) - return encode((List)t, encoded); - if(t instanceof Tuple) - return encode(((Tuple)t).getItems(), encoded); - throw new IllegalArgumentException("Unsupported data type: " + t.getClass().getName()); + else if(t instanceof byte[]) + encode(result, (byte[]) t); + else if(t instanceof String) + encode(result, (String)t); + else if(t instanceof BigInteger) + encode(result, (BigInteger)t); + else if(t instanceof Float) + encode(result, (Float)t); + else if(t instanceof Double) + encode(result, (Double)t); + else if(t instanceof Boolean) + encode(result, (Boolean)t); + else if(t instanceof UUID) + encode(result, (UUID)t); + else if(t instanceof Number) + encode(result, ((Number)t).longValue()); + else if(t instanceof Versionstamp) + encode(result, (Versionstamp)t); + else if(t instanceof List) + encode(result, (List)t); + else if(t instanceof Tuple) + 
encode(result, ((Tuple)t).getItems()); + else + throw new IllegalArgumentException("Unsupported data type: " + t.getClass().getName()); } - static EncodeResult encode(Object t, List encoded) { - return encode(t, false, encoded); + static void encode(EncodeResult result, Object t) { + encode(result, t, false); } - static EncodeResult encode(byte[] bytes, List encoded) { - encoded.add(BYTES_ARR); + static void encode(EncodeResult result, byte[] bytes) { byte[] escaped = ByteArrayUtil.replace(bytes, NULL_ARR, NULL_ESCAPED_ARR); - encoded.add(escaped); - encoded.add(new byte[] {nil}); - - //System.out.println("Joining bytes..."); - return new EncodeResult(2 + escaped.length,-1); + result.add(BYTES_ARR).add(escaped).add(NULL_ARR); } - static EncodeResult encode(String s, List encoded) { - encoded.add(STRING_ARR); + static void encode(EncodeResult result, String s) { byte[] escaped = ByteArrayUtil.replace(s.getBytes(UTF8), NULL_ARR, NULL_ESCAPED_ARR); - encoded.add(escaped); - encoded.add(NULL_ARR); - - //System.out.println("Joining string..."); - return new EncodeResult(2 + escaped.length, -1); + result.add(STRING_ARR).add(escaped).add(NULL_ARR); } - static EncodeResult encode(BigInteger i, List encoded) { + static void encode(EncodeResult result, BigInteger i) { //System.out.println("Encoding integral " + i); if(i.equals(BigInteger.ZERO)) { - encoded.add(new byte[]{INT_ZERO_CODE}); - return new EncodeResult(1,-1); + result.add(new byte[]{INT_ZERO_CODE}); + return; } byte[] bytes = i.toByteArray(); if(i.compareTo(BigInteger.ZERO) > 0) { @@ -232,177 +277,171 @@ class TupleUtil { if(length > 0xff) { throw new IllegalArgumentException("BigInteger magnitude is too large (more than 255 bytes)"); } - byte[] result = new byte[length + 2]; - result[0] = POS_INT_END; - result[1] = (byte)(length); - System.arraycopy(bytes, bytes.length - length, result, 2, length); - encoded.add(result); - return new EncodeResult(result.length, -1); + byte[] intBytes = new byte[length + 2]; + intBytes[0] = POS_INT_END; + intBytes[1] = (byte)(length); + System.arraycopy(bytes, bytes.length - length, intBytes, 2, length); + result.add(intBytes); } - int n = ByteArrayUtil.bisectLeft(size_limits, i); - assert n <= size_limits.length; - //byte[] bytes = ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putLong(i).array(); - //System.out.println(" -- integral has 'n' of " + n + " and output bytes of " + bytes.length); - byte[] result = new byte[n+1]; - result[0] = (byte)(INT_ZERO_CODE + n); - System.arraycopy(bytes, bytes.length - n, result, 1, n); - encoded.add(result); - return new EncodeResult(result.length, -1); - } - if(i.negate().compareTo(size_limits[size_limits.length-1]) > 0) { - int length = byteLength(i.negate().toByteArray()); - if(length > 0xff) { - throw new IllegalArgumentException("BigInteger magnitude is too large (more than 255 bytes)"); + else { + int n = ByteArrayUtil.bisectLeft(size_limits, i); + assert n <= size_limits.length; + //byte[] bytes = ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putLong(i).array(); + //System.out.println(" -- integral has 'n' of " + n + " and output bytes of " + bytes.length); + byte[] intBytes = new byte[n + 1]; + intBytes[0] = (byte) (INT_ZERO_CODE + n); + System.arraycopy(bytes, bytes.length - n, intBytes, 1, n); + result.add(intBytes); } - BigInteger offset = BigInteger.ONE.shiftLeft(length*8).subtract(BigInteger.ONE); - byte[] adjusted = i.add(offset).toByteArray(); - byte[] result = new byte[length + 2]; - result[0] = NEG_INT_START; - result[1] = (byte)(length ^ 
0xff); - if(adjusted.length >= length) { - System.arraycopy(adjusted, adjusted.length - length, result, 2, length); - } else { - Arrays.fill(result, 2, result.length - adjusted.length, (byte)0x00); - System.arraycopy(adjusted, 0, result, result.length - adjusted.length, adjusted.length); + } + else { + if(i.negate().compareTo(size_limits[size_limits.length - 1]) > 0) { + int length = byteLength(i.negate().toByteArray()); + if (length > 0xff) { + throw new IllegalArgumentException("BigInteger magnitude is too large (more than 255 bytes)"); + } + BigInteger offset = BigInteger.ONE.shiftLeft(length * 8).subtract(BigInteger.ONE); + byte[] adjusted = i.add(offset).toByteArray(); + byte[] intBytes = new byte[length + 2]; + intBytes[0] = NEG_INT_START; + intBytes[1] = (byte) (length ^ 0xff); + if (adjusted.length >= length) { + System.arraycopy(adjusted, adjusted.length - length, intBytes, 2, length); + } else { + Arrays.fill(intBytes, 2, intBytes.length - adjusted.length, (byte) 0x00); + System.arraycopy(adjusted, 0, intBytes, intBytes.length - adjusted.length, adjusted.length); + } + result.add(intBytes); + } + else { + int n = ByteArrayUtil.bisectLeft(size_limits, i.negate()); + + assert n >= 0 && n < size_limits.length; // can we do this? it seems to be required for the following statement + + long maxv = size_limits[n].add(i).longValue(); + byte[] adjustedBytes = ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putLong(maxv).array(); + byte[] intBytes = new byte[n + 1]; + intBytes[0] = (byte) (20 - n); + System.arraycopy(adjustedBytes, adjustedBytes.length - n, intBytes, 1, n); + result.add(intBytes); } - encoded.add(result); - return new EncodeResult(result.length, -1); } - int n = ByteArrayUtil.bisectLeft(size_limits, i.negate()); - - assert n >= 0 && n < size_limits.length; // can we do this? 
it seems to be required for the following statement - - long maxv = size_limits[n].add(i).longValue(); - byte[] adjustedBytes = ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putLong(maxv).array(); - byte[] result = new byte[n+1]; - result[0] = (byte)(20 - n); - System.arraycopy(adjustedBytes, adjustedBytes.length - n, result, 1, n); - encoded.add(result); - return new EncodeResult(result.length, -1); } - static EncodeResult encode(Integer i, List encoded) { - return encode(i.longValue(), encoded); + static void encode(EncodeResult result, Integer i) { + encode(result, i.longValue()); } - static EncodeResult encode(long i, List encoded) { - return encode(BigInteger.valueOf(i), encoded); + static void encode(EncodeResult result, long i) { + encode(result, BigInteger.valueOf(i)); } - static EncodeResult encode(Float f, List encoded) { - byte[] result = ByteBuffer.allocate(5).order(ByteOrder.BIG_ENDIAN).put(FLOAT_CODE).putFloat(f).array(); - floatingPointCoding(result, 1, true); - encoded.add(result); - return new EncodeResult(result.length, -1); + static void encode(EncodeResult result, Float f) { + byte[] floatBytes = ByteBuffer.allocate(5).order(ByteOrder.BIG_ENDIAN).put(FLOAT_CODE).putFloat(f).array(); + floatingPointCoding(floatBytes, 1, true); + result.add(floatBytes); } - static EncodeResult encode(Double d, List encoded) { - byte[] result = ByteBuffer.allocate(9).order(ByteOrder.BIG_ENDIAN).put(DOUBLE_CODE).putDouble(d).array(); - floatingPointCoding(result, 1, true); - encoded.add(result); - return new EncodeResult(result.length, -1); + static void encode(EncodeResult result, Double d) { + byte[] doubleBytes = ByteBuffer.allocate(9).order(ByteOrder.BIG_ENDIAN).put(DOUBLE_CODE).putDouble(d).array(); + floatingPointCoding(doubleBytes, 1, true); + result.add(doubleBytes); } - static EncodeResult encode(Boolean b, List encoded) { - if (b) { - encoded.add(TRUE_ARR); - } else { - encoded.add(FALSE_ARR); + static void encode(EncodeResult result, Boolean b) { + if(b) { + result.add(TRUE_ARR); + } + else { + result.add(FALSE_ARR); } - return new EncodeResult(1, -1); } - static EncodeResult encode(UUID uuid, List encoded) { - byte[] result = ByteBuffer.allocate(17).put(UUID_CODE).order(ByteOrder.BIG_ENDIAN) + static void encode(EncodeResult result, UUID uuid) { + byte[] uuidBytes = ByteBuffer.allocate(17).put(UUID_CODE).order(ByteOrder.BIG_ENDIAN) .putLong(uuid.getMostSignificantBits()).putLong(uuid.getLeastSignificantBits()) .array(); - encoded.add(result); - return new EncodeResult(result.length, -1); + result.add(uuidBytes); } - static EncodeResult encode(Versionstamp v, List encoded) { - encoded.add(VERSIONSTAMP_ARR); - encoded.add(v.getBytes()); - return new EncodeResult(1 + Versionstamp.LENGTH, (v.isComplete() ? -1 : 1)); - } - - static EncodeResult encode(List value, List encoded) { - int lenSoFar = 0; - int versionPos = -1; - encoded.add(NESTED_ARR); - for(Object t : value) { - EncodeResult childResult = encode(t, true, encoded); - if(childResult.versionPos > 0) { - if(versionPos > 0) { - throw new IllegalArgumentException("Multiple incomplete Versionstamps included in Tuple"); - } - versionPos = lenSoFar + childResult.versionPos; - } - lenSoFar += childResult.totalLength; + static void encode(EncodeResult result, Versionstamp v) { + result.add(VERSIONSTAMP_ARR); + if(v.isComplete()) { + result.add(v.getBytes()); + } + else { + result.add(v.getBytes(), result.totalLength); } - encoded.add(NULL_ARR); - return new EncodeResult(lenSoFar + 2, (versionPos < 0 ? 
-1 : versionPos + 1)); } - static DecodeResult decode(byte[] rep, int pos, int last) { + static void encode(EncodeResult result, List value) { + result.add(NESTED_ARR); + for(Object t : value) { + encode(result, t, true); + } + result.add(NULL_ARR); + } + + static void decode(DecodeResult result, byte[] rep, int pos, int last) { //System.out.println("Decoding '" + ArrayUtils.printable(rep) + "' at " + pos); // SOMEDAY: codes over 127 will be a problem with the signed Java byte mess int code = rep[pos]; int start = pos + 1; if(code == nil) { - return new DecodeResult(start, null); + result.add(null, start); } - if(code == BYTES_CODE) { + else if(code == BYTES_CODE) { int end = ByteArrayUtil.findTerminator(rep, (byte)0x0, (byte)0xff, start, last); //System.out.println("End of byte string: " + end); byte[] range = ByteArrayUtil.replace(rep, start, end - start, NULL_ESCAPED_ARR, new byte[] { nil }); //System.out.println(" -> byte string contents: '" + ArrayUtils.printable(range) + "'"); - return new DecodeResult(end + 1, range); + result.add(range, end + 1); } - if(code == STRING_CODE) { + else if(code == STRING_CODE) { int end = ByteArrayUtil.findTerminator(rep, (byte)0x0, (byte)0xff, start, last); //System.out.println("End of UTF8 string: " + end); byte[] stringBytes = ByteArrayUtil.replace(rep, start, end - start, NULL_ESCAPED_ARR, new byte[] { nil }); String str = new String(stringBytes, UTF8); //System.out.println(" -> UTF8 string contents: '" + str + "'"); - return new DecodeResult(end + 1, str); + result.add(str, end + 1); } - if(code == FLOAT_CODE) { + else if(code == FLOAT_CODE) { byte[] resBytes = Arrays.copyOfRange(rep, start, start+4); floatingPointCoding(resBytes, 0, false); float res = ByteBuffer.wrap(resBytes).order(ByteOrder.BIG_ENDIAN).getFloat(); - return new DecodeResult(start + 4, res); + result.add(res, start + Float.BYTES); } - if(code == DOUBLE_CODE) { + else if(code == DOUBLE_CODE) { byte[] resBytes = Arrays.copyOfRange(rep, start, start+8); floatingPointCoding(resBytes, 0, false); double res = ByteBuffer.wrap(resBytes).order(ByteOrder.BIG_ENDIAN).getDouble(); - return new DecodeResult(start + 8, res); + result.add(res, start + Double.BYTES); } - if(code == FALSE_CODE) { - return new DecodeResult(start, false); + else if(code == FALSE_CODE) { + result.add(false, start); } - if(code == TRUE_CODE) { - return new DecodeResult(start, true); + else if(code == TRUE_CODE) { + result.add(true, start); } - if(code == UUID_CODE) { + else if(code == UUID_CODE) { ByteBuffer bb = ByteBuffer.wrap(rep, start, 16).order(ByteOrder.BIG_ENDIAN); long msb = bb.getLong(); long lsb = bb.getLong(); - return new DecodeResult(start + 16, new UUID(msb, lsb)); + result.add(new UUID(msb, lsb), start + 16); } - if(code == POS_INT_END) { + else if(code == POS_INT_END) { int n = rep[start] & 0xff; - return new DecodeResult(start + n + 1, new BigInteger(ByteArrayUtil.join(new byte[]{0x00}, Arrays.copyOfRange(rep, start+1, start+n+1)))); + BigInteger res = new BigInteger(ByteArrayUtil.join(new byte[]{0x00}, Arrays.copyOfRange(rep, start+1, start+n+1))); + result.add(res, start + n + 1); } - if(code == NEG_INT_START) { + else if(code == NEG_INT_START) { int n = (rep[start] ^ 0xff) & 0xff; BigInteger origValue = new BigInteger(ByteArrayUtil.join(new byte[]{0x00}, Arrays.copyOfRange(rep, start+1, start+n+1))); BigInteger offset = BigInteger.ONE.shiftLeft(n*8).subtract(BigInteger.ONE); - return new DecodeResult(start + n + 1, origValue.subtract(offset)); + result.add(origValue.subtract(offset), start + n + 
1); } - if(code > NEG_INT_START && code < POS_INT_END) { + else if(code > NEG_INT_START && code < POS_INT_END) { // decode a long byte[] longBytes = new byte[9]; boolean upper = code >= INT_ZERO_CODE; @@ -426,36 +465,37 @@ class TupleUtil { val.compareTo(BigInteger.valueOf(Long.MAX_VALUE))>0) { // This can occur if the thing can be represented with 8 bytes but not // the right sign information. - return new DecodeResult(end, val); + result.add(val, end); + } else { + result.add(val.longValue(), end); } - return new DecodeResult(end, val.longValue()); } - if(code == VERSIONSTAMP_CODE) { - return new DecodeResult( - start + Versionstamp.LENGTH, - Versionstamp.fromBytes(Arrays.copyOfRange(rep, start, start + Versionstamp.LENGTH))); + else if(code == VERSIONSTAMP_CODE) { + Versionstamp val = Versionstamp.fromBytes(Arrays.copyOfRange(rep, start, start + Versionstamp.LENGTH)); + result.add(val, start + Versionstamp.LENGTH); } - if(code == NESTED_CODE) { - List items = new LinkedList(); + else if(code == NESTED_CODE) { + DecodeResult subResult = new DecodeResult(); int endPos = start; while(endPos < rep.length) { if(rep[endPos] == nil) { if(endPos + 1 < rep.length && rep[endPos+1] == (byte)0xff) { - items.add(null); + subResult.add(null, endPos + 2); endPos += 2; } else { endPos += 1; break; } } else { - DecodeResult subResult = decode(rep, endPos, last); - items.add(subResult.o); + decode(subResult, rep, endPos, last); endPos = subResult.end; } } - return new DecodeResult(endPos, items); + result.add(subResult.values, endPos); + } + else { + throw new IllegalArgumentException("Unknown tuple data type " + code + " at index " + pos); } - throw new IllegalArgumentException("Unknown tuple data type " + code + " at index " + pos); } static int compareSignedBigEndian(byte[] arr1, byte[] arr2) { @@ -539,62 +579,51 @@ class TupleUtil { } static List unpack(byte[] bytes, int start, int length) { - List items = new LinkedList<>(); + DecodeResult decodeResult = new DecodeResult(); int pos = start; int end = start + length; while(pos < end) { - DecodeResult decoded = decode(bytes, pos, end); - items.add(decoded.o); - pos = decoded.end; + decode(decodeResult, bytes, pos, end); + pos = decodeResult.end; } - return items; + return decodeResult.values; } - static EncodeResult encodeAll(List items, byte[] prefix, List encoded) { + static void encodeAll(EncodeResult result, List items, byte[] prefix) { if(prefix != null) { - encoded.add(prefix); + result.add(prefix); } - int lenSoFar = (prefix == null) ? 0 : prefix.length; - int versionPos = -1; for(Object t : items) { - EncodeResult result = encode(t, encoded); - if(result.versionPos > 0) { - if(versionPos > 0) { - throw new IllegalArgumentException("Multiple incomplete Versionstamps included in Tuple"); - } - versionPos = result.versionPos + lenSoFar; - } - lenSoFar += result.totalLength; + encode(result, t); } //System.out.println("Joining whole tuple..."); - return new EncodeResult(lenSoFar, versionPos); } static byte[] pack(List items, byte[] prefix) { - List encoded = new ArrayList<>(2 * items.size() + (prefix == null ? 0 : 1)); - EncodeResult result = encodeAll(items, prefix, encoded); - if(result.versionPos > 0) { - throw new IllegalArgumentException("Incomplete Versionstamp included in vanilla tuple pack"); + EncodeResult result = new EncodeResult(2 * items.size() + (prefix == null ? 
0 : 1)); + encodeAll(result, items, prefix); + if(result.versionPos >= 0) { + throw new IllegalArgumentException("Incomplete Versionstamp included in vanilla tuple packInternal"); } else { - return ByteArrayUtil.join(null, encoded); + return ByteArrayUtil.join(null, result.encodedValues); } } static byte[] packWithVersionstamp(List items, byte[] prefix) { - List encoded = new ArrayList<>(2 * items.size() + (prefix == null ? 1 : 2)); - EncodeResult result = encodeAll(items, prefix, encoded); + EncodeResult result = new EncodeResult(2 * items.size() + (prefix == null ? 1 : 2)); + encodeAll(result, items, prefix); if(result.versionPos < 0) { - throw new IllegalArgumentException("No incomplete Versionstamp included in tuple pack with versionstamp"); + throw new IllegalArgumentException("No incomplete Versionstamp included in tuple packInternal with versionstamp"); } else { if(result.versionPos > 0xffff) { throw new IllegalArgumentException("Tuple has incomplete version at position " + result.versionPos + " which is greater than the maximum " + 0xffff); } if (FDB.instance().getAPIVersion() < 520) { - encoded.add(ByteBuffer.allocate(2).order(ByteOrder.LITTLE_ENDIAN).putShort((short)result.versionPos).array()); + result.add(ByteBuffer.allocate(Short.BYTES).order(ByteOrder.LITTLE_ENDIAN).putShort((short)result.versionPos).array()); } else { - encoded.add(ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN).putInt(result.versionPos).array()); + result.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(result.versionPos).array()); } - return ByteArrayUtil.join(null, encoded); + return ByteArrayUtil.join(null, result.encodedValues); } } @@ -617,7 +646,10 @@ class TupleUtil { public static void main(String[] args) { try { byte[] bytes = pack(Collections.singletonList(4), null); - assert 4 == (Integer)(decode(bytes, 0, bytes.length).o); + DecodeResult result = new DecodeResult(); + decode(result, bytes, 0, bytes.length); + int val = (int)result.values.get(0); + assert 4 == val; } catch (Exception e) { e.printStackTrace(); System.out.println("Error " + e.getMessage()); @@ -625,7 +657,9 @@ class TupleUtil { try { byte[] bytes = pack(Collections.singletonList("\u021Aest \u0218tring"), null); - String string = (String)(decode(bytes, 0, bytes.length).o); + DecodeResult result = new DecodeResult(); + decode(result, bytes, 0, bytes.length); + String string = (String)result.values.get(0); System.out.println("contents -> " + string); assert "\u021Aest \u0218tring".equals(string); } catch (Exception e) { @@ -635,7 +669,7 @@ class TupleUtil { /*Object[] a = new Object[] { "\u0000a", -2, "b\u0001", 12345, ""}; List o = Arrays.asList(a); - byte[] packed = pack( o, null ); + byte[] packed = packInternal( o, null ); System.out.println("packed length: " + packed.length); o = unpack( packed, 0, packed.length ); System.out.println("unpacked elements: " + o); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java index df9ccf6d45..dada5131d8 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java @@ -25,17 +25,15 @@ public class TuplePerformanceTest { public Tuple createTuple(int length) { List values = new ArrayList<>(length); - for(int i = 0; i < length; i++) { + for (int i = 0; i < length; i++) { double choice = r.nextDouble(); - if(choice < 0.1) { + if (choice < 
0.1) { values.add(null); - } - else if(choice < 0.2) { + } else if (choice < 0.2) { byte[] bytes = new byte[r.nextInt(20)]; r.nextBytes(bytes); values.add(bytes); - } - else if(choice < 0.3) { + } else if (choice < 0.3) { char[] chars = new char[r.nextInt(20)]; for (int j = 0; j < chars.length; j++) { chars[j] = (char)('a' + r.nextInt(26)); @@ -171,7 +169,7 @@ public class TuplePerformanceTest { } public static void main(String[] args) { - TuplePerformanceTest tester = new TuplePerformanceTest(new Random(), 100_000, 10_000); + TuplePerformanceTest tester = new TuplePerformanceTest(new Random(), 100_000, 10_000_000); tester.run(); } } From e6ce0ebd2717c1223c0f2aac9e37581e14a14516 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Sun, 24 Feb 2019 20:49:10 -0800 Subject: [PATCH 24/46] improve tuple performance tester for more types and add serialization check in TupleTest --- .../test/TuplePerformanceTest.java | 76 ++++++++++++-- .../apple/foundationdb/test/TupleTest.java | 98 ++++++++++++++++++- 2 files changed, 162 insertions(+), 12 deletions(-) diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java index dada5131d8..cf79ff41a9 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java @@ -13,30 +13,40 @@ import com.apple.foundationdb.tuple.Versionstamp; public class TuplePerformanceTest { + private enum GeneratedTypes { + ALL, + LONG, + FLOATING_POINT + } + private final Random r; private final int ignoreIterations; private final int iterations; + private final GeneratedTypes generatedTypes; - public TuplePerformanceTest(Random r, int ignoreIterations, int iterations) { + public TuplePerformanceTest(Random r, int ignoreIterations, int iterations, GeneratedTypes generatedTypes) { this.r = r; this.ignoreIterations = ignoreIterations; this.iterations = iterations; + this.generatedTypes = generatedTypes; } - public Tuple createTuple(int length) { + public Tuple createMultiTypeTuple(int length) { List values = new ArrayList<>(length); - for (int i = 0; i < length; i++) { + for(int i = 0; i < length; i++) { double choice = r.nextDouble(); - if (choice < 0.1) { + if(choice < 0.1) { values.add(null); - } else if (choice < 0.2) { + } + else if(choice < 0.2) { byte[] bytes = new byte[r.nextInt(20)]; r.nextBytes(bytes); values.add(bytes); - } else if (choice < 0.3) { + } + else if(choice < 0.3) { char[] chars = new char[r.nextInt(20)]; for (int j = 0; j < chars.length; j++) { - chars[j] = (char)('a' + r.nextInt(26)); + chars[j] = (char) ('a' + r.nextInt(26)); } values.add(new String(chars)); } @@ -67,7 +77,55 @@ public class TuplePerformanceTest { values.add(nested); } } - return Tuple.from(values); + return Tuple.fromItems(values); + } + + public Tuple createLongsTuple(int length) { + List values = new ArrayList<>(length); + for(int i = 0; i < length; i++) { + int byteLength = r.nextInt(Long.BYTES + 1); + long val = 0L; + for(int x = 0; x < byteLength; x++) { + int nextBytes = r.nextInt(256); + val = (val << 8) + nextBytes; + } + values.add(val); + } + return Tuple.fromItems(values); + } + + public Tuple createFloatingPointTuple(int length) { + List values = new ArrayList<>(length); + for(int i = 0; i < length; i++) { + double choice = r.nextDouble(); + if(choice < 0.40) { + values.add(r.nextFloat()); + } + else if(choice < 0.80) { + values.add(r.nextDouble()); + } + // These 
last two are more likely to produce NaN values + else if(choice < 0.90) { + values.add(Float.intBitsToFloat(r.nextInt())); + } + else { + values.add(Double.longBitsToDouble(r.nextLong())); + } + } + return Tuple.fromItems(values); + } + + public Tuple createTuple(int length) { + switch (generatedTypes) { + case ALL: + return createMultiTypeTuple(length); + case LONG: + return createLongsTuple(length); + case FLOATING_POINT: + return createFloatingPointTuple(length); + default: + throw new IllegalStateException("unknown generated types " + generatedTypes); + } } public void run() { @@ -169,7 +227,7 @@ public class TuplePerformanceTest { } public static void main(String[] args) { - TuplePerformanceTest tester = new TuplePerformanceTest(new Random(), 100_000, 10_000_000); + TuplePerformanceTest tester = new TuplePerformanceTest(new Random(), 100_000, 10_000_000, GeneratedTypes.ALL); tester.run(); } } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java index ad9297e02d..528c11f93a 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java @@ -20,24 +20,116 @@ package com.apple.foundationdb.test; -import com.apple.foundationdb.Database; -import com.apple.foundationdb.FDB; import com.apple.foundationdb.TransactionContext; +import com.apple.foundationdb.tuple.ByteArrayUtil; import com.apple.foundationdb.tuple.Tuple; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + public class TupleTest { + private static final byte FF = (byte)0xff; + public static void main(String[] args) throws InterruptedException { final int reps = 1000; try { - FDB fdb = FDB.selectAPIVersion(610); + // FDB fdb = FDB.selectAPIVersion(610); + serializedForms(); + /* try(Database db = fdb.open()) { runTests(reps, db); } + */ } catch(Throwable t) { t.printStackTrace(); } } + private static class TupleSerialization { + private final Tuple tuple; + private final byte[] serialization; + + TupleSerialization(Tuple tuple, byte[] serialization) { + this.tuple = tuple; + this.serialization = serialization; + } + + static void addAll(List list, Object... 
args) { + for(int i = 0; i < args.length; i += 2) { + TupleSerialization serialization = new TupleSerialization((Tuple)args[i], (byte[])args[i + 1]); + list.add(serialization); + } + } + } + + private static void serializedForms() { + List serializations = new ArrayList<>(); + TupleSerialization.addAll(serializations, + Tuple.from(0L), new byte[]{0x14}, + Tuple.from(BigInteger.ZERO), new byte[]{0x14}, + Tuple.from(1L), new byte[]{0x15, 0x01}, + Tuple.from(BigInteger.ONE), new byte[]{0x15, 0x01}, + Tuple.from(-1L), new byte[]{0x13, FF - 1}, + Tuple.from(BigInteger.ONE.negate()), new byte[]{0x13, FF - 1}, + Tuple.from(255L), new byte[]{0x15, FF}, + Tuple.from(BigInteger.valueOf(255)), new byte[]{0x15, FF}, + Tuple.from(-255L), new byte[]{0x13, 0x00}, + Tuple.from(BigInteger.valueOf(-255)), new byte[]{0x13, 0x00}, + Tuple.from(256L), new byte[]{0x16, 0x01, 0x00}, + Tuple.from(BigInteger.valueOf(256)), new byte[]{0x16, 0x01, 0x00}, + Tuple.from(-256L), new byte[]{0x12, FF - 1, FF}, + Tuple.from(BigInteger.valueOf(-256)), new byte[]{0x12, FF - 1, FF}, + Tuple.from(65536), new byte[]{0x17, 0x01, 0x00, 0x00}, + Tuple.from(-65536), new byte[]{0x11, FF - 1, FF, FF}, + Tuple.from(Long.MAX_VALUE), new byte[]{0x1C, 0x7f, FF, FF, FF, FF, FF, FF, FF}, + Tuple.from(BigInteger.valueOf(Long.MAX_VALUE)), new byte[]{0x1C, 0x7f, FF, FF, FF, FF, FF, FF, FF}, + Tuple.from(BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.ONE)), new byte[]{0x1C, (byte)0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(BigInteger.ONE.shiftLeft(64).subtract(BigInteger.ONE)), new byte[]{0x1C, FF, FF, FF, FF, FF, FF, FF, FF}, + Tuple.from(BigInteger.ONE.shiftLeft(64)), new byte[]{0x1D, 0x09, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(-((1L << 32) - 1)), new byte[]{0x10, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(BigInteger.ONE.shiftLeft(32).subtract(BigInteger.ONE).negate()), new byte[]{0x10, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(Long.MIN_VALUE + 2), new byte[]{0x0C, (byte)0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}, + Tuple.from(Long.MIN_VALUE + 1), new byte[]{0x0C, (byte)0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(BigInteger.valueOf(Long.MIN_VALUE).add(BigInteger.ONE)), new byte[]{0x0C, (byte)0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(Long.MIN_VALUE), new byte[]{0x0C, 0x7f, FF, FF, FF, FF, FF, FF, FF}, + Tuple.from(BigInteger.valueOf(Long.MIN_VALUE)), new byte[]{0x0C, 0x7f, FF, FF, FF, FF, FF, FF, FF}, + Tuple.from(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE)), new byte[]{0x0C, 0x7f, FF, FF, FF, FF, FF, FF, FF - 1}, + Tuple.from(BigInteger.ONE.shiftLeft(64).subtract(BigInteger.ONE).negate()), new byte[]{0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(3.14f), new byte[]{0x20, (byte)0xc0, 0x48, (byte)0xf5, (byte)0xc3}, + Tuple.from(-3.14f), new byte[]{0x20, (byte)0x3f, (byte)0xb7, (byte)0x0a, (byte)0x3c}, + Tuple.from(3.14), new byte[]{0x21, (byte)0xc0, (byte)0x09, (byte)0x1e, (byte)0xb8, (byte)0x51, (byte)0xeb, (byte)0x85, (byte)0x1f}, + Tuple.from(-3.14), new byte[]{0x21, (byte)0x3f, (byte)0xf6, (byte)0xe1, (byte)0x47, (byte)0xae, (byte)0x14, (byte)0x7a, (byte)0xe0}, + Tuple.from(0.0f), new byte[]{0x20, (byte)0x80, 0x00, 0x00, 0x00}, + Tuple.from(-0.0f), new byte[]{0x20, 0x7f, FF, FF, FF}, + Tuple.from(0.0), new byte[]{0x21, (byte)0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(-0.0), new byte[]{0x21, 0x7f, FF, FF, FF, FF, FF, FF, FF}, + Tuple.from(Float.POSITIVE_INFINITY), new byte[]{0x20, FF, (byte)0x80, 0x00, 
0x00}, + Tuple.from(Float.NEGATIVE_INFINITY), new byte[]{0x20, 0x00, 0x7f, FF, FF}, + Tuple.from(Double.POSITIVE_INFINITY), new byte[]{0x21, FF, (byte)0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(Double.NEGATIVE_INFINITY), new byte[]{0x21, 0x00, 0x0f, FF, FF, FF, FF, FF, FF}, + Tuple.from(Float.intBitsToFloat(Integer.MAX_VALUE)), new byte[]{0x20, FF, FF, FF, FF}, + Tuple.from(Double.longBitsToDouble(Long.MAX_VALUE)), new byte[]{0x21, FF, FF, FF, FF, FF, FF, FF, FF}, + Tuple.from(Float.intBitsToFloat(~0)), new byte[]{0x20, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(Double.longBitsToDouble(~0L)), new byte[]{0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} + ); + + for(TupleSerialization serialization : serializations) { + System.out.println("Packing " + serialization.tuple + " (expecting: " + ByteArrayUtil.printable(serialization.serialization) + ")"); + if(!Arrays.equals(serialization.tuple.pack(), serialization.serialization)) { + throw new RuntimeException("Tuple " + serialization.tuple + " has serialization " + ByteArrayUtil.printable(serialization.tuple.pack()) + + " which does not match expected serialization " + ByteArrayUtil.printable(serialization.serialization)); + } + if(!Objects.equals(serialization.tuple, Tuple.fromBytes(serialization.serialization))) { + throw new RuntimeException("Tuple " + serialization.tuple + " does not match deserialization " + Tuple.fromBytes(serialization.serialization) + + " which comes from serialization " + ByteArrayUtil.printable(serialization.serialization)); + } + } + System.out.println("All tuples had matching serializations"); + } + private static void runTests(final int reps, TransactionContext db) { System.out.println("Running tests..."); long start = System.currentTimeMillis(); From e9771364d797133444623ec4ebf7ce0bce5d517e Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Sun, 24 Feb 2019 20:52:28 -0800 Subject: [PATCH 25/46] various Java tuple performance tweaks These include: * Memoizing packed representations within Tuples * Using longs instead of BigIntegers if possible * As much as possible sticking to manipulating primitive types when using floats/doubles --- .../foundationdb/tuple/ByteArrayUtil.java | 3 +- .../com/apple/foundationdb/tuple/Tuple.java | 14 +- .../apple/foundationdb/tuple/TupleUtil.java | 446 ++++++++++-------- 3 files changed, 252 insertions(+), 211 deletions(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java index 247ae78fb0..eeea3e1799 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java @@ -229,8 +229,7 @@ public class ByteArrayUtil { int n = Arrays.binarySearch(arr, i); if(n >= 0) return n; - int ip = (n + 1) * -1; - return ip; + return (n + 1) * -1; } /** diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java index 7b14632452..b3761d8c5d 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java @@ -824,9 +824,12 @@ public class Tuple implements Comparable, Iterable { } /** - * Get the number of bytes in the packed representation of this {@code Tuple}. + * Get the number of bytes in the packed representation of this {@code Tuple}. 
Note that at the + * moment, this number is calculated by packing the {@code Tuple} and looking at its size. This method + * will memoize the result, however, so asking the same {@code Tuple} for its size multiple times + * is a fast operation. * - * @return + * @return the number of bytes in the packed representation of this {@code Tuple} */ public int getPackedSize() { byte[] p = packMaybeVersionstamp(null); @@ -847,7 +850,12 @@ public class Tuple implements Comparable, Iterable { */ @Override public int compareTo(Tuple t) { - return comparator.compare(elements, t.elements); + if(packed != null && t.packed != null) { + return ByteArrayUtil.compareUnsigned(packed, t.packed); + } + else { + return comparator.compare(elements, t.elements); + } } /** diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java index f25828f47d..5b220d2c90 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java @@ -36,8 +36,10 @@ import com.apple.foundationdb.FDB; class TupleUtil { private static final byte nil = 0x00; - private static final BigInteger[] size_limits; + private static final BigInteger[] BIG_INT_SIZE_LIMITS; private static final Charset UTF8; + private static final BigInteger LONG_MIN_VALUE = BigInteger.valueOf(Long.MIN_VALUE); + private static final BigInteger LONG_MAX_VALUE = BigInteger.valueOf(Long.MAX_VALUE); private static final IterableComparator iterableComparator; private static final byte BYTES_CODE = 0x01; @@ -55,27 +57,28 @@ class TupleUtil { private static final byte[] NULL_ARR = new byte[] {nil}; private static final byte[] NULL_ESCAPED_ARR = new byte[] {nil, (byte)0xFF}; - private static final byte[] BYTES_ARR = new byte[]{0x01}; - private static final byte[] STRING_ARR = new byte[]{0x02}; - private static final byte[] NESTED_ARR = new byte[]{0x05}; - private static final byte[] FALSE_ARR = new byte[]{0x26}; - private static final byte[] TRUE_ARR = new byte[]{0x27}; - private static final byte[] VERSIONSTAMP_ARR = new byte[]{0x33}; + private static final byte[] BYTES_ARR = new byte[]{BYTES_CODE}; + private static final byte[] STRING_ARR = new byte[]{STRING_CODE}; + private static final byte[] NESTED_ARR = new byte[]{NESTED_CODE}; + private static final byte[] INT_ZERO_ARR = new byte[]{INT_ZERO_CODE}; + private static final byte[] FALSE_ARR = new byte[]{FALSE_CODE}; + private static final byte[] TRUE_ARR = new byte[]{TRUE_CODE}; + private static final byte[] VERSIONSTAMP_ARR = new byte[]{VERSIONSTAMP_CODE}; static { - size_limits = new BigInteger[9]; - for(int i = 0; i < 9; i++) { - size_limits[i] = (BigInteger.ONE).shiftLeft(i * 8).subtract(BigInteger.ONE); + BIG_INT_SIZE_LIMITS = new BigInteger[9]; + for(int i = 0; i < BIG_INT_SIZE_LIMITS.length; i++) { + BIG_INT_SIZE_LIMITS[i] = (BigInteger.ONE).shiftLeft(i * 8).subtract(BigInteger.ONE); } UTF8 = Charset.forName("UTF-8"); iterableComparator = new IterableComparator(); } - static class DecodeResult { + static class DecodeState { final List values; int end; - DecodeResult() { + DecodeState() { values = new ArrayList<>(); end = 0; } @@ -86,18 +89,18 @@ class TupleUtil { } } - static class EncodeResult { + static class EncodeState { final List encodedValues; int totalLength; int versionPos; - EncodeResult(int capacity) { + EncodeState(int capacity) { this.encodedValues = new ArrayList<>(capacity); totalLength = 0; versionPos = -1; } - EncodeResult 
add(byte[] encoded, int versionPos) { + EncodeState add(byte[] encoded, int versionPos) { if(versionPos >= 0 && this.versionPos >= 0) { throw new IllegalArgumentException("Multiple incomplete Versionstamps included in Tuple"); } @@ -107,7 +110,7 @@ class TupleUtil { return this; } - EncodeResult add(byte[] encoded) { + EncodeState add(byte[] encoded) { encodedValues.add(encoded); totalLength += encoded.length; return this; @@ -122,37 +125,37 @@ class TupleUtil { return 0; } - /** - * Takes the Big-Endian byte representation of a floating point number and adjusts - * it so that it sorts correctly. For encoding, if the sign bit is 1 (the number - * is negative), then we need to flip all of the bits; otherwise, just flip the - * sign bit. For decoding, if the sign bit is 0 (the number is negative), then - * we also need to flip all of the bits; otherwise, just flip the sign bit. - * This will mutate in place the given array. - * - * @param bytes Big-Endian IEEE encoding of a floating point number - * @param start the (zero-indexed) first byte in the array to mutate - * @param encode true if we encoding the float and false if we are decoding - * @return the encoded {@code byte[]} - */ - static byte[] floatingPointCoding(byte[] bytes, int start, boolean encode) { - if(encode && (bytes[start] & (byte)0x80) != (byte)0x00) { - for(int i = start; i < bytes.length; i++) { - bytes[i] = (byte) (bytes[i] ^ 0xff); - } - } else if(!encode && (bytes[start] & (byte)0x80) != (byte)0x80) { - for(int i = start; i < bytes.length; i++) { - bytes[i] = (byte) (bytes[i] ^ 0xff); - } - } else { - bytes[start] = (byte) (0x80 ^ bytes[start]); - } + // These four functions are for adjusting the encoding of floating point numbers so + // that when their byte representation is written out in big-endian order, unsigned + // lexicographic byte comparison orders the values in the same way as the semantic + // ordering of the values. This means flipping all bits for negative values and flipping + // only the most-significant bit (i.e., the sign bit as all values in Java are signed) + // in the case that the number is positive. For these purposes, 0.0 is positive and -0.0 + // is negative. - return bytes; + static int encodeFloatBits(float f) { + int intBits = Float.floatToRawIntBits(f); + return (intBits < 0) ? (~intBits) : (intBits ^ Integer.MIN_VALUE); } - static byte[] join(List items) { - return ByteArrayUtil.join(null, items); + static long encodeDoubleBits(double d) { + long longBits = Double.doubleToRawLongBits(d); + return (longBits < 0L) ? (~longBits) : (longBits ^ Long.MIN_VALUE); + } + + static float decodeFloatBits(int i) { + int origBits = (i >= 0) ? (~i) : (i ^ Integer.MIN_VALUE); + return Float.intBitsToFloat(origBits); + } + + static double decodeDoubleBits(long l) { + long origBits = (l >= 0) ? (~l) : (l ^ Long.MIN_VALUE); + return Double.longBitsToDouble(origBits); + } + + // Get the number of bytes in the representation of a long. + static int byteCount(long i) { + return (Long.SIZE + 7 - Long.numberOfLeadingZeros(i >= 0 ? 
i : -i)) / 8; } private static void adjustVersionPosition300(byte[] packed, int delta) { @@ -215,64 +218,64 @@ class TupleUtil { throw new IllegalArgumentException("Unsupported data type: " + o.getClass().getName()); } - static void encode(EncodeResult result, Object t, boolean nested) { + static void encode(EncodeState state, Object t, boolean nested) { if(t == null) { if(nested) { - result.add(NULL_ESCAPED_ARR); + state.add(NULL_ESCAPED_ARR); } else { - result.add(NULL_ARR); + state.add(NULL_ARR); } } else if(t instanceof byte[]) - encode(result, (byte[]) t); + encode(state, (byte[]) t); else if(t instanceof String) - encode(result, (String)t); - else if(t instanceof BigInteger) - encode(result, (BigInteger)t); + encode(state, (String)t); else if(t instanceof Float) - encode(result, (Float)t); + encode(state, (Float)t); else if(t instanceof Double) - encode(result, (Double)t); + encode(state, (Double)t); else if(t instanceof Boolean) - encode(result, (Boolean)t); + encode(state, (Boolean)t); else if(t instanceof UUID) - encode(result, (UUID)t); + encode(state, (UUID)t); + else if(t instanceof BigInteger) + encode(state, (BigInteger)t); else if(t instanceof Number) - encode(result, ((Number)t).longValue()); + encode(state, ((Number)t).longValue()); else if(t instanceof Versionstamp) - encode(result, (Versionstamp)t); + encode(state, (Versionstamp)t); else if(t instanceof List) - encode(result, (List)t); + encode(state, (List)t); else if(t instanceof Tuple) - encode(result, ((Tuple)t).getItems()); + encode(state, ((Tuple)t).getItems()); else throw new IllegalArgumentException("Unsupported data type: " + t.getClass().getName()); } - static void encode(EncodeResult result, Object t) { - encode(result, t, false); + static void encode(EncodeState state, Object t) { + encode(state, t, false); } - static void encode(EncodeResult result, byte[] bytes) { + static void encode(EncodeState state, byte[] bytes) { byte[] escaped = ByteArrayUtil.replace(bytes, NULL_ARR, NULL_ESCAPED_ARR); - result.add(BYTES_ARR).add(escaped).add(NULL_ARR); + state.add(BYTES_ARR).add(escaped).add(NULL_ARR); } - static void encode(EncodeResult result, String s) { + static void encode(EncodeState state, String s) { byte[] escaped = ByteArrayUtil.replace(s.getBytes(UTF8), NULL_ARR, NULL_ESCAPED_ARR); - result.add(STRING_ARR).add(escaped).add(NULL_ARR); + state.add(STRING_ARR).add(escaped).add(NULL_ARR); } - static void encode(EncodeResult result, BigInteger i) { + static void encode(EncodeState state, BigInteger i) { //System.out.println("Encoding integral " + i); if(i.equals(BigInteger.ZERO)) { - result.add(new byte[]{INT_ZERO_CODE}); + state.add(INT_ZERO_ARR); return; } byte[] bytes = i.toByteArray(); if(i.compareTo(BigInteger.ZERO) > 0) { - if(i.compareTo(size_limits[size_limits.length-1]) > 0) { + if(i.compareTo(BIG_INT_SIZE_LIMITS[BIG_INT_SIZE_LIMITS.length-1]) > 0) { int length = byteLength(bytes); if(length > 0xff) { throw new IllegalArgumentException("BigInteger magnitude is too large (more than 255 bytes)"); @@ -281,21 +284,20 @@ class TupleUtil { intBytes[0] = POS_INT_END; intBytes[1] = (byte)(length); System.arraycopy(bytes, bytes.length - length, intBytes, 2, length); - result.add(intBytes); + state.add(intBytes); } else { - int n = ByteArrayUtil.bisectLeft(size_limits, i); - assert n <= size_limits.length; - //byte[] bytes = ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putLong(i).array(); + int n = ByteArrayUtil.bisectLeft(BIG_INT_SIZE_LIMITS, i); + assert n <= BIG_INT_SIZE_LIMITS.length; 
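			// A worked example of this variable-length integer encoding (hand-derived from the
			// rules above, not output captured from the patch): with INT_ZERO_CODE = 0x14 and one
			// type code per byte of magnitude,
			//     Tuple.from(1330L).pack()  -> {0x16, 0x05, 0x32}   // 0x14 + 2, then 0x0532 big-endian
			//     Tuple.from(-1330L).pack() -> {0x12, 0xFA, 0xCD}   // 0x14 - 2, then -1330 + (2^16 - 1)
			// Negative magnitudes are stored offset by 2^(8n) - 1, i.e. as the one's complement of
			// the absolute value, so unsigned byte order agrees with numeric order.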
//System.out.println(" -- integral has 'n' of " + n + " and output bytes of " + bytes.length); byte[] intBytes = new byte[n + 1]; intBytes[0] = (byte) (INT_ZERO_CODE + n); System.arraycopy(bytes, bytes.length - n, intBytes, 1, n); - result.add(intBytes); + state.add(intBytes); } } else { - if(i.negate().compareTo(size_limits[size_limits.length - 1]) > 0) { + if(i.negate().compareTo(BIG_INT_SIZE_LIMITS[BIG_INT_SIZE_LIMITS.length - 1]) > 0) { int length = byteLength(i.negate().toByteArray()); if (length > 0xff) { throw new IllegalArgumentException("BigInteger magnitude is too large (more than 255 bytes)"); @@ -311,92 +313,109 @@ class TupleUtil { Arrays.fill(intBytes, 2, intBytes.length - adjusted.length, (byte) 0x00); System.arraycopy(adjusted, 0, intBytes, intBytes.length - adjusted.length, adjusted.length); } - result.add(intBytes); + state.add(intBytes); } else { - int n = ByteArrayUtil.bisectLeft(size_limits, i.negate()); + int n = ByteArrayUtil.bisectLeft(BIG_INT_SIZE_LIMITS, i.negate()); - assert n >= 0 && n < size_limits.length; // can we do this? it seems to be required for the following statement + assert n >= 0 && n < BIG_INT_SIZE_LIMITS.length; // can we do this? it seems to be required for the following statement - long maxv = size_limits[n].add(i).longValue(); + long maxv = BIG_INT_SIZE_LIMITS[n].add(i).longValue(); byte[] adjustedBytes = ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putLong(maxv).array(); byte[] intBytes = new byte[n + 1]; - intBytes[0] = (byte) (20 - n); + intBytes[0] = (byte) (INT_ZERO_CODE - n); System.arraycopy(adjustedBytes, adjustedBytes.length - n, intBytes, 1, n); - result.add(intBytes); + state.add(intBytes); } } } - static void encode(EncodeResult result, Integer i) { - encode(result, i.longValue()); + static void encode(EncodeState state, long i) { + if(i == 0L) { + state.add(INT_ZERO_ARR); + return; + } + int n = byteCount(i); + byte[] intBytes = new byte[n + 1]; + // First byte encodes number of bytes (as difference from INT_ZERO_CODE) + intBytes[0] = (byte)(INT_ZERO_CODE + (i >= 0 ? n : -n)); + // For positive integers, copy the bytes in big-endian order excluding leading 0x00 bytes. + // For negative integers, copy the bytes of the one's complement representation excluding + // the leading 0xff bytes. As Java stores negative values in two's complement, we subtract 1 + // from negative values. + long val = Long.reverseBytes((i >= 0) ? 
i : (i - 1)) >> (Long.SIZE - 8 * n); + for(int x = 1; x < intBytes.length; x++) { + intBytes[x] = (byte)(val & 0xff); + val >>= 8; + } + state.add(intBytes); } - static void encode(EncodeResult result, long i) { - encode(result, BigInteger.valueOf(i)); + static void encode(EncodeState state, Float f) { + byte[] floatBytes = ByteBuffer.allocate(1 + Float.BYTES).order(ByteOrder.BIG_ENDIAN) + .put(FLOAT_CODE) + .putInt(encodeFloatBits(f)) + .array(); + state.add(floatBytes); } - static void encode(EncodeResult result, Float f) { - byte[] floatBytes = ByteBuffer.allocate(5).order(ByteOrder.BIG_ENDIAN).put(FLOAT_CODE).putFloat(f).array(); - floatingPointCoding(floatBytes, 1, true); - result.add(floatBytes); + static void encode(EncodeState state, Double d) { + byte[] doubleBytes = ByteBuffer.allocate(1 + Double.BYTES).order(ByteOrder.BIG_ENDIAN) + .put(DOUBLE_CODE) + .putLong(encodeDoubleBits(d)) + .array(); + state.add(doubleBytes); } - static void encode(EncodeResult result, Double d) { - byte[] doubleBytes = ByteBuffer.allocate(9).order(ByteOrder.BIG_ENDIAN).put(DOUBLE_CODE).putDouble(d).array(); - floatingPointCoding(doubleBytes, 1, true); - result.add(doubleBytes); - } - - static void encode(EncodeResult result, Boolean b) { + static void encode(EncodeState state, Boolean b) { if(b) { - result.add(TRUE_ARR); + state.add(TRUE_ARR); } else { - result.add(FALSE_ARR); + state.add(FALSE_ARR); } } - static void encode(EncodeResult result, UUID uuid) { + static void encode(EncodeState state, UUID uuid) { byte[] uuidBytes = ByteBuffer.allocate(17).put(UUID_CODE).order(ByteOrder.BIG_ENDIAN) .putLong(uuid.getMostSignificantBits()).putLong(uuid.getLeastSignificantBits()) .array(); - result.add(uuidBytes); + state.add(uuidBytes); } - static void encode(EncodeResult result, Versionstamp v) { - result.add(VERSIONSTAMP_ARR); + static void encode(EncodeState state, Versionstamp v) { + state.add(VERSIONSTAMP_ARR); if(v.isComplete()) { - result.add(v.getBytes()); + state.add(v.getBytes()); } else { - result.add(v.getBytes(), result.totalLength); + state.add(v.getBytes(), state.totalLength); } } - static void encode(EncodeResult result, List value) { - result.add(NESTED_ARR); + static void encode(EncodeState state, List value) { + state.add(NESTED_ARR); for(Object t : value) { - encode(result, t, true); + encode(state, t, true); } - result.add(NULL_ARR); + state.add(NULL_ARR); } - static void decode(DecodeResult result, byte[] rep, int pos, int last) { + static void decode(DecodeState state, byte[] rep, int pos, int last) { //System.out.println("Decoding '" + ArrayUtils.printable(rep) + "' at " + pos); // SOMEDAY: codes over 127 will be a problem with the signed Java byte mess int code = rep[pos]; int start = pos + 1; if(code == nil) { - result.add(null, start); + state.add(null, start); } else if(code == BYTES_CODE) { int end = ByteArrayUtil.findTerminator(rep, (byte)0x0, (byte)0xff, start, last); //System.out.println("End of byte string: " + end); byte[] range = ByteArrayUtil.replace(rep, start, end - start, NULL_ESCAPED_ARR, new byte[] { nil }); //System.out.println(" -> byte string contents: '" + ArrayUtils.printable(range) + "'"); - result.add(range, end + 1); + state.add(range, end + 1); } else if(code == STRING_CODE) { int end = ByteArrayUtil.findTerminator(rep, (byte)0x0, (byte)0xff, start, last); @@ -404,78 +423,91 @@ class TupleUtil { byte[] stringBytes = ByteArrayUtil.replace(rep, start, end - start, NULL_ESCAPED_ARR, new byte[] { nil }); String str = new String(stringBytes, UTF8); 
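				// Both byte[] and String payloads escape embedded 0x00 as the pair 0x00 0xFF; findTerminator
				// skips those pairs while scanning for the real terminator, and the replace(...) call above
				// undoes the escaping. A hand-derived example (not part of this patch):
				//     Tuple.from("a\0b").pack() -> {0x02, 'a', 0x00, 0xFF, 'b', 0x00}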
//System.out.println(" -> UTF8 string contents: '" + str + "'"); - result.add(str, end + 1); + state.add(str, end + 1); } else if(code == FLOAT_CODE) { - byte[] resBytes = Arrays.copyOfRange(rep, start, start+4); - floatingPointCoding(resBytes, 0, false); - float res = ByteBuffer.wrap(resBytes).order(ByteOrder.BIG_ENDIAN).getFloat(); - result.add(res, start + Float.BYTES); + int rawFloatBits = ByteBuffer.wrap(rep, start, Float.BYTES).getInt(); + float res = decodeFloatBits(rawFloatBits); + state.add(res, start + Float.BYTES); } else if(code == DOUBLE_CODE) { - byte[] resBytes = Arrays.copyOfRange(rep, start, start+8); - floatingPointCoding(resBytes, 0, false); - double res = ByteBuffer.wrap(resBytes).order(ByteOrder.BIG_ENDIAN).getDouble(); - result.add(res, start + Double.BYTES); + long rawDoubleBits = ByteBuffer.wrap(rep, start, Double.BYTES).getLong(); + double res = decodeDoubleBits(rawDoubleBits); + state.add(res, start + Double.BYTES); } else if(code == FALSE_CODE) { - result.add(false, start); + state.add(false, start); } else if(code == TRUE_CODE) { - result.add(true, start); + state.add(true, start); } else if(code == UUID_CODE) { ByteBuffer bb = ByteBuffer.wrap(rep, start, 16).order(ByteOrder.BIG_ENDIAN); long msb = bb.getLong(); long lsb = bb.getLong(); - result.add(new UUID(msb, lsb), start + 16); + state.add(new UUID(msb, lsb), start + 16); } else if(code == POS_INT_END) { int n = rep[start] & 0xff; BigInteger res = new BigInteger(ByteArrayUtil.join(new byte[]{0x00}, Arrays.copyOfRange(rep, start+1, start+n+1))); - result.add(res, start + n + 1); + state.add(res, start + n + 1); } else if(code == NEG_INT_START) { int n = (rep[start] ^ 0xff) & 0xff; BigInteger origValue = new BigInteger(ByteArrayUtil.join(new byte[]{0x00}, Arrays.copyOfRange(rep, start+1, start+n+1))); BigInteger offset = BigInteger.ONE.shiftLeft(n*8).subtract(BigInteger.ONE); - result.add(origValue.subtract(offset), start + n + 1); + state.add(origValue.subtract(offset), start + n + 1); } else if(code > NEG_INT_START && code < POS_INT_END) { // decode a long - byte[] longBytes = new byte[9]; - boolean upper = code >= INT_ZERO_CODE; - int n = upper ? code - 20 : 20 - code; + boolean positive = code >= INT_ZERO_CODE; + int n = positive ? code - INT_ZERO_CODE : INT_ZERO_CODE - code; int end = start + n; if(rep.length < end) { throw new RuntimeException("Invalid tuple (possible truncation)"); } - System.arraycopy(rep, start, longBytes, longBytes.length-n, n); - if (!upper) - for(int i=longBytes.length-n; i 0)) { + long res = 0L; + for(int i = start; i < end; i++) { + res = (res << 8) + (rep[i] & 0xff); + } + state.add(res, end); + } + else if(!positive && (n < Long.BYTES || rep[start] < 0)) { + long res = ~0L; + for(int i = start; i < end; i++) { + res = (res << 8) + (rep[i] & 0xff); + } + state.add(res + 1, end); + } + else { + byte[] longBytes = new byte[9]; + System.arraycopy(rep, start, longBytes, longBytes.length-n, n); + if (!positive) + for(int i=longBytes.length-n; i0) { - // This can occur if the thing can be represented with 8 bytes but not - // the right sign information. - result.add(val, end); - } else { - result.add(val.longValue(), end); + // Convert to long if in range -- otherwise, leave as BigInteger. + if (val.compareTo(LONG_MIN_VALUE) >= 0 && val.compareTo(LONG_MAX_VALUE) <= 0) { + state.add(val.longValue(), end); + } else { + // This can occur if the thing can be represented with 8 bytes but not + // the right sign information. 
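					// Concretely: code 0x1C carries eight payload bytes, and a payload such as
					// 0x80 00 00 00 00 00 00 00 decodes to 2^63, which fits in eight bytes but not in a
					// signed long; code 0x0C can likewise encode values below Long.MIN_VALUE. The fast
					// paths above already returned a primitive long for everything in the 64-bit range.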
+ state.add(val, end); + } } } else if(code == VERSIONSTAMP_CODE) { Versionstamp val = Versionstamp.fromBytes(Arrays.copyOfRange(rep, start, start + Versionstamp.LENGTH)); - result.add(val, start + Versionstamp.LENGTH); + state.add(val, start + Versionstamp.LENGTH); } else if(code == NESTED_CODE) { - DecodeResult subResult = new DecodeResult(); + DecodeState subResult = new DecodeState(); int endPos = start; while(endPos < rep.length) { if(rep[endPos] == nil) { @@ -491,25 +523,13 @@ class TupleUtil { endPos = subResult.end; } } - result.add(subResult.values, endPos); + state.add(subResult.values, endPos); } else { throw new IllegalArgumentException("Unknown tuple data type " + code + " at index " + pos); } } - static int compareSignedBigEndian(byte[] arr1, byte[] arr2) { - if(arr1[0] < 0 && arr2[0] < 0) { - return -1 * ByteArrayUtil.compareUnsigned(arr1, arr2); - } else if(arr1[0] < 0) { - return -1; - } else if(arr2[0] < 0) { - return 1; - } else { - return ByteArrayUtil.compareUnsigned(arr1, arr2); - } - } - static int compareItems(Object item1, Object item2) { int code1 = TupleUtil.getCodeFor(item1); int code2 = TupleUtil.getCodeFor(item2); @@ -529,33 +549,39 @@ class TupleUtil { return ByteArrayUtil.compareUnsigned(((String)item1).getBytes(UTF8), ((String)item2).getBytes(UTF8)); } if(code1 == INT_ZERO_CODE) { - BigInteger bi1; - if(item1 instanceof BigInteger) { - bi1 = (BigInteger)item1; - } else { - bi1 = BigInteger.valueOf(((Number)item1).longValue()); + if(item1 instanceof Long && item2 instanceof Long) { + // This should be the common case, so it's probably worth including as a way out. + return Long.compare((Long)item1, (Long)item2); } - BigInteger bi2; - if(item2 instanceof BigInteger) { - bi2 = (BigInteger)item2; - } else { - bi2 = BigInteger.valueOf(((Number)item2).longValue()); + else { + BigInteger bi1; + if (item1 instanceof BigInteger) { + bi1 = (BigInteger) item1; + } else { + bi1 = BigInteger.valueOf(((Number) item1).longValue()); + } + BigInteger bi2; + if (item2 instanceof BigInteger) { + bi2 = (BigInteger) item2; + } else { + bi2 = BigInteger.valueOf(((Number) item2).longValue()); + } + return bi1.compareTo(bi2); } - return bi1.compareTo(bi2); - } - if(code1 == DOUBLE_CODE) { - // This is done over vanilla double comparison basically to handle NaN - // sorting correctly. - byte[] dBytes1 = ByteBuffer.allocate(8).putDouble((Double)item1).array(); - byte[] dBytes2 = ByteBuffer.allocate(8).putDouble((Double)item2).array(); - return compareSignedBigEndian(dBytes1, dBytes2); } if(code1 == FLOAT_CODE) { // This is done for the same reason that double comparison is done // that way. - byte[] fBytes1 = ByteBuffer.allocate(4).putFloat((Float)item1).array(); - byte[] fBytes2 = ByteBuffer.allocate(4).putFloat((Float)item2).array(); - return compareSignedBigEndian(fBytes1, fBytes2); + int fbits1 = encodeFloatBits((Float)item1); + int fbits2 = encodeFloatBits((Float)item2); + return Integer.compareUnsigned(fbits1, fbits2); + } + if(code1 == DOUBLE_CODE) { + // This is done over vanilla double comparison basically to handle NaN + // sorting correctly. 
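				// Unsigned comparison of the transformed bits yields a total order: NaNs with the sign
				// bit set first, then -Infinity through -0.0, then 0.0 through +Infinity, then positive
				// NaNs. This agrees with the unsigned byte order of the packed form, which the
				// memoized-array fast path added to Tuple.compareTo in this patch depends on.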
+ long dbits1 = encodeDoubleBits((Double)item1); + long dbits2 = encodeDoubleBits((Double)item2); + return Long.compareUnsigned(dbits1, dbits2); } if(code1 == FALSE_CODE) { return Boolean.compare((Boolean)item1, (Boolean)item2); @@ -579,51 +605,53 @@ class TupleUtil { } static List unpack(byte[] bytes, int start, int length) { - DecodeResult decodeResult = new DecodeResult(); + DecodeState decodeState = new DecodeState(); int pos = start; int end = start + length; while(pos < end) { - decode(decodeResult, bytes, pos, end); - pos = decodeResult.end; + decode(decodeState, bytes, pos, end); + pos = decodeState.end; } - return decodeResult.values; + return decodeState.values; } - static void encodeAll(EncodeResult result, List items, byte[] prefix) { + static void encodeAll(EncodeState state, List items, byte[] prefix) { if(prefix != null) { - result.add(prefix); + state.add(prefix); } for(Object t : items) { - encode(result, t); + encode(state, t); } //System.out.println("Joining whole tuple..."); } static byte[] pack(List items, byte[] prefix) { - EncodeResult result = new EncodeResult(2 * items.size() + (prefix == null ? 0 : 1)); - encodeAll(result, items, prefix); - if(result.versionPos >= 0) { + EncodeState state = new EncodeState(2 * items.size() + (prefix == null ? 0 : 1)); + encodeAll(state, items, prefix); + if(state.versionPos >= 0) { throw new IllegalArgumentException("Incomplete Versionstamp included in vanilla tuple packInternal"); - } else { - return ByteArrayUtil.join(null, result.encodedValues); + } + else { + return ByteArrayUtil.join(null, state.encodedValues); } } static byte[] packWithVersionstamp(List items, byte[] prefix) { - EncodeResult result = new EncodeResult(2 * items.size() + (prefix == null ? 1 : 2)); - encodeAll(result, items, prefix); - if(result.versionPos < 0) { + EncodeState state = new EncodeState(2 * items.size() + (prefix == null ? 
1 : 2)); + encodeAll(state, items, prefix); + if(state.versionPos < 0) { throw new IllegalArgumentException("No incomplete Versionstamp included in tuple packInternal with versionstamp"); - } else { - if(result.versionPos > 0xffff) { - throw new IllegalArgumentException("Tuple has incomplete version at position " + result.versionPos + " which is greater than the maximum " + 0xffff); + } + else { + if(state.versionPos > 0xffff) { + throw new IllegalArgumentException("Tuple has incomplete version at position " + state.versionPos + " which is greater than the maximum " + 0xffff); } if (FDB.instance().getAPIVersion() < 520) { - result.add(ByteBuffer.allocate(Short.BYTES).order(ByteOrder.LITTLE_ENDIAN).putShort((short)result.versionPos).array()); + state.add(ByteBuffer.allocate(Short.BYTES).order(ByteOrder.LITTLE_ENDIAN).putShort((short)state.versionPos).array()); } else { - result.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(result.versionPos).array()); + state.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(state.versionPos).array()); } - return ByteArrayUtil.join(null, result.encodedValues); + return ByteArrayUtil.join(null, state.encodedValues); } } @@ -631,13 +659,17 @@ class TupleUtil { return items.anyMatch(item -> { if(item == null) { return false; - } else if(item instanceof Versionstamp) { + } + else if(item instanceof Versionstamp) { return !((Versionstamp) item).isComplete(); - } else if(item instanceof Tuple) { + } + else if(item instanceof Tuple) { return hasIncompleteVersionstamp(((Tuple) item).stream()); - } else if(item instanceof Collection) { + } + else if(item instanceof Collection) { return hasIncompleteVersionstamp(((Collection) item).stream()); - } else { + } + else { return false; } }); @@ -646,23 +678,25 @@ class TupleUtil { public static void main(String[] args) { try { byte[] bytes = pack(Collections.singletonList(4), null); - DecodeResult result = new DecodeResult(); + DecodeState result = new DecodeState(); decode(result, bytes, 0, bytes.length); int val = (int)result.values.get(0); assert 4 == val; - } catch (Exception e) { + } + catch(Exception e) { e.printStackTrace(); System.out.println("Error " + e.getMessage()); } try { byte[] bytes = pack(Collections.singletonList("\u021Aest \u0218tring"), null); - DecodeResult result = new DecodeResult(); + DecodeState result = new DecodeState(); decode(result, bytes, 0, bytes.length); String string = (String)result.values.get(0); System.out.println("contents -> " + string); assert "\u021Aest \u0218tring".equals(string); - } catch (Exception e) { + } + catch(Exception e) { e.printStackTrace(); System.out.println("Error " + e.getMessage()); } From a74dfa548782da90c87f11b30b6cd087d843efd1 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Sun, 24 Feb 2019 23:49:31 -0800 Subject: [PATCH 26/46] compare strings by unicode codepoint without copying --- bindings/java/CMakeLists.txt | 1 + .../apple/foundationdb/tuple/StringUtil.java | 75 ++++++++++++++++ .../apple/foundationdb/tuple/TupleUtil.java | 8 +- .../test/TuplePerformanceTest.java | 55 +++++++----- .../apple/foundationdb/test/TupleTest.java | 85 ++++++++++++++++++- 5 files changed, 201 insertions(+), 23 deletions(-) create mode 100644 bindings/java/src/main/com/apple/foundationdb/tuple/StringUtil.java diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 93e7e7ea8e..f8c1c25a65 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -56,6 +56,7 @@ set(JAVA_BINDING_SRCS 
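  # Every binding source file must appear in this list for the CMake build to compile it;
  # StringUtil.java, added just below, is the new surrogate-aware string comparison helper.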
src/main/com/apple/foundationdb/tuple/package-info.java src/main/com/apple/foundationdb/tuple/Tuple.java src/main/com/apple/foundationdb/tuple/TupleUtil.java + src/main/com/apple/foundationdb/tuple/StringUtil.java src/main/com/apple/foundationdb/tuple/Versionstamp.java) set(JAVA_TESTS_SRCS diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/StringUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/StringUtil.java new file mode 100644 index 0000000000..660d04a6e1 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/StringUtil.java @@ -0,0 +1,75 @@ +/* + * StringUtil.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb.tuple; + +final class StringUtil { + private static final char SURROGATE_COUNT = Character.MAX_LOW_SURROGATE - Character.MIN_HIGH_SURROGATE + 1; + private static final char ABOVE_SURROGATES = Character.MAX_VALUE - Character.MAX_LOW_SURROGATE; + + static char adjustForSurrogates(char c, String s, int pos) { + if(c > Character.MAX_LOW_SURROGATE) { + return (char)(c - SURROGATE_COUNT); + } + else { + // Validate the UTF-16 string as this can do weird things on invalid strings + if((Character.isHighSurrogate(c) && (pos + 1 >= s.length() || !Character.isLowSurrogate(s.charAt(pos + 1)))) || + (Character.isLowSurrogate(c) && (pos == 0 || !Character.isHighSurrogate(s.charAt(pos - 1))))) { + throw new IllegalArgumentException("malformed UTF-16 string does not follow high surrogate with low surrogate"); + } + return (char)(c + ABOVE_SURROGATES); + + } + } + + // Compare two strings based on their UTF-8 code point values. Note that Java stores strings + // using UTF-16. However, {@link Tuple}s are encoded using UTF-8. Using unsigned byte comparison, + // UTF-8 strings will sort based on their Unicode codepoints. However, UTF-16 strings almost, + // but not quite, sort that way. This can be addressed by fixing up surrogates. There are 0x800 surrogate + // values and about 0x2000 code points above the maximum surrogate value. For anything that is a surrogate, + // shift it up by 0x2000, and anything that is above the maximum surrogate value, shift it down by 0x800. + // This makes all surrogates sort after all non-surrogates. + // + // See: https://ssl.icu-project.org/docs/papers/utf16_code_point_order.html + static int compareUtf8(String s1, String s2) { + // Ignore common prefix at the beginning which will compare equal regardless of encoding + int pos = 0; + while(pos < s1.length() && pos < s2.length() && s1.charAt(pos) == s2.charAt(pos)) { + pos++; + } + if(pos >= s1.length() || pos >= s2.length()) { + // One string is the prefix of another, so return based on length. 
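			// The surrogate fix-up below is what aligns this comparison with UTF-8 byte order. A
			// hand-derived example: comparing "\uFB49" (U+FB49, three UTF-8 bytes) against the pair
			// 0xD83D 0xDD25 (U+1F525, four UTF-8 bytes), a raw char comparison gives 0xD83D < 0xFB49,
			// but after the shifts 0xFB49 maps to 0xF349 while 0xD83D maps to 0xF83D, so the surrogate
			// sorts higher, matching codepoint (and therefore UTF-8) order.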
+ return Integer.compare(s1.length(), s2.length()); + } + // Compare first different character + char c1 = s1.charAt(pos); + char c2 = s2.charAt(pos); + // Apply "fix up" for surrogates + if(c1 >= Character.MIN_HIGH_SURROGATE) { + c1 = adjustForSurrogates(c1, s1, pos); + } + if(c2 >= Character.MIN_HIGH_SURROGATE) { + c2 = adjustForSurrogates(c2, s2, pos); + } + return Character.compare(c1, c2); + } + + private StringUtil() {} +} diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java index 5b220d2c90..34d0f78653 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java @@ -546,7 +546,13 @@ class TupleUtil { return ByteArrayUtil.compareUnsigned((byte[])item1, (byte[])item2); } if(code1 == STRING_CODE) { - return ByteArrayUtil.compareUnsigned(((String)item1).getBytes(UTF8), ((String)item2).getBytes(UTF8)); + try { + return StringUtil.compareUtf8((String)item1, (String)item2); + } + catch(IllegalArgumentException e) { + // Encountered malformed unicode when comparing. Use byte comparison. + return ByteArrayUtil.compareUnsigned(((String)item1).getBytes(UTF8), ((String)item2).getBytes(UTF8)); + } } if(code1 == INT_ZERO_CODE) { if(item1 instanceof Long && item2 instanceof Long) { diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java index cf79ff41a9..3de9b76785 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java @@ -142,6 +142,7 @@ public class TuplePerformanceTest { long packNanos = 0L; long unpackNanos = 0L; long equalsNanos = 0L; + long equalsArrayNanos = 0L; long hashNanos = 0L; long secondHashNanos = 0L; long subspacePackNanos = 0L; @@ -164,12 +165,22 @@ public class TuplePerformanceTest { endNanos = System.nanoTime(); unpackNanos += endNanos - startNanos; + // Copy items over as if both are packed, their byte arrays are compared + Tuple tCopy = Tuple.fromList(t.getItems()); + Tuple t2Copy = Tuple.fromList(t2.getItems()); + startNanos = System.nanoTime(); + if (!tCopy.equals(t2Copy)) { + throw new RuntimeException("deserialized did not match serialized: " + t + " -- " + t2); + } + endNanos = System.nanoTime(); + equalsNanos += endNanos - startNanos; + startNanos = System.nanoTime(); if(!t.equals(t2)) { throw new RuntimeException("deserialized did not match serialized: " + t + " -- " + t2); } endNanos = System.nanoTime(); - equalsNanos += endNanos - startNanos; + equalsArrayNanos += endNanos - startNanos; startNanos = System.nanoTime(); byte[] subspacePacked = subspace.pack(t); @@ -182,7 +193,7 @@ public class TuplePerformanceTest { startNanos = System.nanoTime(); Tuple t3 = subspace.unpack(subspacePacked); endNanos = System.nanoTime(); - if(!t.equals(t3)) { + if (!Tuple.fromList(t.getItems()).equals(Tuple.fromList(t3.getItems())) || !t.equals(t3)) { throw new RuntimeException("does not unpack equally from subspace"); } if(!Arrays.equals(t.pack(), t3.pack())) { @@ -205,25 +216,27 @@ public class TuplePerformanceTest { } System.out.println("Test ended."); - System.out.printf(" Total elements: %d%n", totalLength); - System.out.printf(" Total bytes: %d kB%n", totalBytes / 1000); - System.out.printf(" Bytes per tuple: %f B%n", totalBytes * 1.0 / iterations); - System.out.printf(" 
Pack time: %f s%n", packNanos * 1e-9); - System.out.printf(" Pack time per tuple: %f \u03BCs%n", packNanos * 1e-3 / iterations); - System.out.printf(" Pack time per kB: %f \u03BCs%n", packNanos * 1.0 / totalBytes); - System.out.printf(" Serialization rate: %f objects / \u03BCs%n", totalLength * 1000.0 / packNanos); - System.out.printf(" Unpack time: %f s%n", unpackNanos * 1e-9); - System.out.printf(" Unpack time per tuple: %f \u03BCs%n", unpackNanos * 1e-3 / iterations); - System.out.printf(" Equals time: %f s%n", equalsNanos * 1e-9); - System.out.printf(" Equals time per tuple: %f \u03BCs%n", equalsNanos * 1e-3 / iterations); - System.out.printf(" Subspace pack time: %f s%n", subspacePackNanos * 1e-9); - System.out.printf(" Subspace pack time per tuple: %f \u03BCs%n", subspacePackNanos * 1e-3 / iterations); - System.out.printf(" Subspace unpack time: %f s%n", subspaceUnpackNanos * 1e-9); - System.out.printf(" Subspace unpack time per tuple: %f \u03BCs%n", subspaceUnpackNanos * 1e-3 / iterations); - System.out.printf(" Hash time: %f s%n", hashNanos * 1e-9); - System.out.printf(" Hash time per tuple: %f \u03BCs%n", hashNanos * 1e-3 / iterations); - System.out.printf(" Second hash time: %f s%n", secondHashNanos * 1e-9); - System.out.printf(" Second hash time per tuple: %f \u03BCs%n", secondHashNanos * 1e-3 / iterations); + System.out.printf(" Total elements: %d%n", totalLength); + System.out.printf(" Total bytes: %d kB%n", totalBytes / 1000); + System.out.printf(" Bytes per tuple: %f B%n", totalBytes * 1.0 / iterations); + System.out.printf(" Pack time: %f s%n", packNanos * 1e-9); + System.out.printf(" Pack time per tuple: %f \u03BCs%n", packNanos * 1e-3 / iterations); + System.out.printf(" Pack time per kB: %f \u03BCs%n", packNanos * 1.0 / totalBytes); + System.out.printf(" Serialization rate: %f objects / \u03BCs%n", totalLength * 1000.0 / packNanos); + System.out.printf(" Unpack time: %f s%n", unpackNanos * 1e-9); + System.out.printf(" Unpack time per tuple: %f \u03BCs%n", unpackNanos * 1e-3 / iterations); + System.out.printf(" Equals time: %f s%n", equalsNanos * 1e-9); + System.out.printf(" Equals time per tuple: %f \u03BCs%n", equalsNanos * 1e-3 / iterations); + System.out.printf(" Equals time (using packed): %f s%n", equalsArrayNanos * 1e-9); + System.out.printf(" Equals time (using packed) per tuple: %f \u03BCs%n", equalsArrayNanos * 1e-3 / iterations); + System.out.printf(" Subspace pack time: %f s%n", subspacePackNanos * 1e-9); + System.out.printf(" Subspace pack time per tuple: %f \u03BCs%n", subspacePackNanos * 1e-3 / iterations); + System.out.printf(" Subspace unpack time: %f s%n", subspaceUnpackNanos * 1e-9); + System.out.printf(" Subspace unpack time per tuple: %f \u03BCs%n", subspaceUnpackNanos * 1e-3 / iterations); + System.out.printf(" Hash time: %f s%n", hashNanos * 1e-9); + System.out.printf(" Hash time per tuple: %f \u03BCs%n", hashNanos * 1e-3 / iterations); + System.out.printf(" Second hash time: %f s%n", secondHashNanos * 1e-9); + System.out.printf(" Second hash time per tuple: %f \u03BCs%n", secondHashNanos * 1e-3 / iterations); } public static void main(String[] args) { diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java index 528c11f93a..305c1a90f0 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java @@ -29,6 +29,7 @@ import java.util.ArrayList; import java.util.Arrays; import 
java.util.List; import java.util.Objects; +import java.util.UUID; public class TupleTest { private static final byte FF = (byte)0xff; @@ -38,6 +39,7 @@ public class TupleTest { try { // FDB fdb = FDB.selectAPIVersion(610); serializedForms(); + comparisons(); /* try(Database db = fdb.open()) { runTests(reps, db); @@ -113,7 +115,16 @@ public class TupleTest { Tuple.from(Float.intBitsToFloat(Integer.MAX_VALUE)), new byte[]{0x20, FF, FF, FF, FF}, Tuple.from(Double.longBitsToDouble(Long.MAX_VALUE)), new byte[]{0x21, FF, FF, FF, FF, FF, FF, FF, FF}, Tuple.from(Float.intBitsToFloat(~0)), new byte[]{0x20, 0x00, 0x00, 0x00, 0x00}, - Tuple.from(Double.longBitsToDouble(~0L)), new byte[]{0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} + Tuple.from(Double.longBitsToDouble(~0L)), new byte[]{0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from(""), new byte[]{0x02, 0x00}, + Tuple.from("hello"), new byte[]{0x02, 'h', 'e', 'l', 'l', 'o', 0x00}, + Tuple.from("\u4e2d\u6587"), new byte[]{0x02, (byte)0xe4, (byte)0xb8, (byte)0xad, (byte)0xe6, (byte)0x96, (byte)0x87, 0x00}, + Tuple.from("\u03bc\u03ac\u03b8\u03b7\u03bc\u03b1"), new byte[]{0x02, (byte)0xce, (byte)0xbc, (byte)0xce, (byte)0xac, (byte)0xce, (byte)0xb8, (byte)0xce, (byte)0xb7, (byte)0xce, (byte)0xbc, (byte)0xce, (byte)0xb1, 0x00}, + Tuple.from(new String(new int[]{0x1f525}, 0, 1)), new byte[]{0x02, (byte)0xf0, (byte)0x9f, (byte)0x94, (byte)0xa5, 0x00}, + Tuple.from("\ud83d\udd25"), new byte[]{0x02, (byte)0xf0, (byte)0x9f, (byte)0x94, (byte)0xa5, 0x00}, + Tuple.from("\ud83e\udd6f"), new byte[]{0x02, (byte)0xf0, (byte)0x9f, (byte)0xa5, (byte)0xaf, 0x00}, + Tuple.from("\udd25\ud83e\udd6f"), new byte[]{0x02, 0x3f, (byte)0xf0, (byte)0x9f, (byte)0xa5, (byte)0xaf, 0x00}, // malformed string - low surrogate without high surrogate + Tuple.from("a\udd25\ud83e\udd6f"), new byte[]{0x02, 'a', 0x3f, (byte)0xf0, (byte)0x9f, (byte)0xa5, (byte)0xaf, 0x00} // malformed string - low surrogate without high surrogate ); for(TupleSerialization serialization : serializations) { @@ -130,6 +141,78 @@ public class TupleTest { System.out.println("All tuples had matching serializations"); } + private static void comparisons() { + List tuples = Arrays.asList( + Tuple.from(0L), + Tuple.from(BigInteger.ZERO), + Tuple.from(1L), + Tuple.from(BigInteger.ONE), + Tuple.from(-1L), + Tuple.from(BigInteger.ONE.negate()), + Tuple.from(Long.MAX_VALUE), + Tuple.from(Long.MIN_VALUE), + Tuple.from(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE)), + Tuple.from(BigInteger.valueOf(Long.MIN_VALUE).shiftLeft(1)), + Tuple.from(-0.0f), + Tuple.from(0.0f), + Tuple.from(-0.0), + Tuple.from(0.0), + Tuple.from(Float.NEGATIVE_INFINITY), + Tuple.from(Double.NEGATIVE_INFINITY), + Tuple.from(Float.NaN), + Tuple.from(Double.NaN), + Tuple.from(Float.intBitsToFloat(Float.floatToIntBits(Float.NaN) + 1)), + Tuple.from(Double.longBitsToDouble(Double.doubleToLongBits(Double.NaN) + 1)), + Tuple.from(Float.intBitsToFloat(Float.floatToIntBits(Float.NaN) + 2)), + Tuple.from(Double.longBitsToDouble(Double.doubleToLongBits(Double.NaN) + 2)), + Tuple.from(Float.intBitsToFloat(Float.floatToIntBits(Float.NaN) ^ Integer.MIN_VALUE)), + Tuple.from(Double.longBitsToDouble(Double.doubleToLongBits(Double.NaN) ^ Long.MIN_VALUE)), + Tuple.from(Float.intBitsToFloat(Float.floatToIntBits(Float.NaN) ^ Integer.MIN_VALUE + 1)), + Tuple.from(Double.longBitsToDouble(Double.doubleToLongBits(Double.NaN) ^ Long.MIN_VALUE + 1)), + Tuple.from(Float.POSITIVE_INFINITY), + Tuple.from(Double.POSITIVE_INFINITY), + 
Tuple.from((Object)new byte[0]), + Tuple.from((Object)new byte[]{0x00}), + Tuple.from((Object)new byte[]{0x00, FF}), + Tuple.from((Object)new byte[]{0x7f}), + Tuple.from((Object)new byte[]{(byte)0x80}), + Tuple.from("a"), + Tuple.from("\u03bc\u03ac\u03b8\u03b7\u03bc\u03b1"), + Tuple.from("\u03bc\u03b1\u0301\u03b8\u03b7\u03bc\u03b1"), + Tuple.from("\u4e2d\u6587"), + Tuple.from("\u4e2d\u570B"), + Tuple.from("\ud83d\udd25"), + Tuple.from("\ud83e\udd6f"), + Tuple.from("a\ud83d\udd25"), + Tuple.from("\ufb49"), + Tuple.from("\ud83d\udd25\ufb49"), + Tuple.from("\ud8ed\ud8ed"), // malformed string -- two high surrogates + Tuple.from("\ud8ed\ud8eda"), // malformed string -- two high surrogates + Tuple.from("\udd25\udd25"), // malformed string -- two low surrogates + Tuple.from("a\udd25\ud8ed"), // malformed string -- two low surrogates + Tuple.from("\udd25\ud83e\udd6f"), // malformed string -- low surrogate followed by high then low surrogate + Tuple.from("\udd6f\ud83e\udd6f"), // malformed string -- low surrogate followed by high then low surrogate + Tuple.from(new UUID(-1, 0)), + Tuple.from(new UUID(-1, -1)), + Tuple.from(new UUID(1, -1)), + Tuple.from(new UUID(1, 1)) + ); + + for(Tuple t1 : tuples) { + for(Tuple t2 : tuples) { + System.out.println("Comparing " + t1 + " and " + t2); + // Copy the items over to new tuples to avoid having them use the memoized packed representations + Tuple t1copy = Tuple.fromList(t1.getItems()); + Tuple t2copy = Tuple.fromList(t2.getItems()); + int semanticComparison = t1copy.compareTo(t2copy); + int byteComparison = ByteArrayUtil.compareUnsigned(t1.pack(), t2.pack()); + if(Integer.signum(semanticComparison) != Integer.signum(byteComparison)) { + throw new RuntimeException("Tuple t1 and t2 comparison mismatched: semantic = " + semanticComparison + " while byte order = " + byteComparison); + } + } + } + } + private static void runTests(final int reps, TransactionContext db) { System.out.println("Running tests..."); long start = System.currentTimeMillis(); From 663d750e1de2ceb1a2d8fd78ab5c511eeec37fd9 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Mon, 25 Feb 2019 21:59:16 -0800 Subject: [PATCH 27/46] pack Tuples with a single byte array allocation of the right size --- .../foundationdb/tuple/ByteArrayUtil.java | 143 +++---- .../apple/foundationdb/tuple/StringUtil.java | 43 +++ .../com/apple/foundationdb/tuple/Tuple.java | 56 +-- .../apple/foundationdb/tuple/TupleUtil.java | 357 +++++++++++------- .../foundationdb/test/AsyncStackTester.java | 6 +- .../apple/foundationdb/test/StackTester.java | 8 +- .../test/TuplePerformanceTest.java | 64 +++- .../apple/foundationdb/test/TupleTest.java | 107 +++++- 8 files changed, 547 insertions(+), 237 deletions(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java index eeea3e1799..d848c296ff 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java @@ -20,7 +20,6 @@ package com.apple.foundationdb.tuple; -import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; @@ -154,7 +153,10 @@ public class ByteArrayUtil { * @return a newly created array where {@code pattern} replaced with {@code replacement} */ public static byte[] replace(byte[] src, byte[] pattern, byte[] replacement) { - return join(replacement, split(src, pattern)); + if(src == null) { + return null; + } + return 
replace(src, 0, src.length, pattern, replacement); } /** @@ -171,7 +173,69 @@ public class ByteArrayUtil { */ public static byte[] replace(byte[] src, int offset, int length, byte[] pattern, byte[] replacement) { - return join(replacement, split(src, offset, length, pattern)); + if(pattern == null || pattern.length == 0) { + return Arrays.copyOfRange(src, offset, offset + length); + } + ByteBuffer dest; + if(replacement == null || replacement.length != pattern.length) { + // Array might change size. This is the "tricky" case. + byte patternFirst = pattern[0]; + int patternOccurrences = 0; + int currentPosition = offset; + while(currentPosition < offset + length) { + if(src[currentPosition] == patternFirst && regionEquals(src, currentPosition, pattern)) { + patternOccurrences++; + currentPosition += pattern.length; + } + else { + currentPosition++; + } + } + if(patternOccurrences == 0) { + // Pattern doesn't occur. Just return a copy of the needed region. + return Arrays.copyOfRange(src, offset, offset + length); + } + int replacementLength = (replacement == null) ? 0 : replacement.length; + int newLength = length + patternOccurrences * (replacementLength - pattern.length); + if(newLength == 0) { + return new byte[0]; + } + else { + dest = ByteBuffer.allocate(newLength); + } + } + else { + // No matter what, the array will stay the same size as replacement.length = pattern.length + dest = ByteBuffer.allocate(length); + } + replace(src, offset, length, pattern, replacement, dest); + return dest.array(); + } + + static void replace(byte[] src, int offset, int length, byte[] pattern, byte[] replacement, ByteBuffer dest) { + if(pattern == null || pattern.length == 0) { + dest.put(src, offset, length); + return; + } + byte patternFirst = pattern[0]; + int lastPosition = offset; + int currentPosition = offset; + + while(currentPosition < offset + length) { + if(src[currentPosition] == patternFirst && regionEquals(src, currentPosition, pattern)) { + dest.put(src, lastPosition, currentPosition - lastPosition); + if(replacement != null) { + dest.put(replacement); + } + currentPosition += pattern.length; + lastPosition = currentPosition; + } + else { + currentPosition++; + } + } + + dest.put(src, lastPosition, currentPosition - lastPosition); } /** @@ -203,7 +267,7 @@ public class ByteArrayUtil { * @return a list of byte arrays from {@code src} now not containing {@code delimiter} */ public static List split(byte[] src, int offset, int length, byte[] delimiter) { - List parts = new LinkedList(); + List parts = new LinkedList<>(); int idx = offset; int lastSplitEnd = offset; while(idx <= (offset+length) - delimiter.length) { @@ -225,13 +289,6 @@ public class ByteArrayUtil { return parts; } - static int bisectLeft(BigInteger[] arr, BigInteger i) { - int n = Arrays.binarySearch(arr, i); - if(n >= 0) - return n; - return (n + 1) * -1; - } - /** * Compare byte arrays for equality and ordering purposes. Elements in the array * are interpreted and compared as unsigned bytes. Neither parameter @@ -276,61 +333,6 @@ public class ByteArrayUtil { return true; } - /** - * Scan through an array of bytes to find the first occurrence of a specific value. - * - * @param src array to scan. Must not be {@code null}. - * @param what the value for which to search. - * @param start the index at which to start the search. If this is at or after - * the end of {@code src}, the result will always be {@code -1}. 
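	// (Both findNext and findTerminator are deleted by this patch: the tuple decoder's terminator
	// scan moves into DecodeState.findNullTerminator, which locates the 0x00 terminator and counts
	// 0x00 0xFF escape pairs in a single pass over the bytes.)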
- * @param end the index one past the last entry at which to search - * - * @return return the location of the first instance of {@code value}, or - * {@code -1} if not found. - */ - static int findNext(byte[] src, byte what, int start, int end) { - for(int i = start; i < end; i++) { - if(src[i] == what) - return i; - } - return -1; - } - - /** - * Gets the index of the first element after the next occurrence of the byte sequence [nm] - * @param v the bytes to scan through - * @param n first character to find - * @param m second character to find - * @param start the index at which to start the scan - * - * @return the index after the next occurrence of [nm] - */ - static int findTerminator(byte[] v, byte n, byte m, int start) { - return findTerminator(v, n, m, start, v.length); - } - - /** - * Gets the index of the first element after the next occurrence of the byte sequence [nm] - * @param v the bytes to scan through - * @param n first character to find - * @param m second character to find - * @param start the index at which to start the scan - * @param end the index at which to stop the search (exclusive) - * - * @return the index after the next occurrence of [nm] - */ - static int findTerminator(byte[] v, byte n, byte m, int start, int end) { - int pos = start; - while(true) { - pos = findNext(v, n, pos, end); - if(pos < 0) - return end; - if(pos + 1 == end || v[pos+1] != m) - return pos; - pos += 2; - } - } - /** * Computes the first key that would sort outside the range prefixed by {@code key}. * {@code key} must be non-null, and contain at least some character this is not @@ -417,5 +419,14 @@ public class ByteArrayUtil { return s.toString(); } + static int nullCount(byte[] val) { + int nulls = 0; + for(int i = 0; i < val.length; i++) { + if(val[i] == 0x00) + nulls += 1; + } + return nulls; + } + private ByteArrayUtil() {} } diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/StringUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/StringUtil.java index 660d04a6e1..cd1d18d627 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/StringUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/StringUtil.java @@ -71,5 +71,48 @@ final class StringUtil { return Character.compare(c1, c2); } + static int packedSize(String s) { + final int strLength = s.length(); + int size = 0; + int pos = 0; + + while(pos < strLength) { + char c = s.charAt(pos); + if(c == '\0') { + // Null is encoded as \x00\xff + size += 2; + } + else if(c <= 0x7f) { + // ASCII code point. Only 1 byte. + size += 1; + } + else if(c <= 0x07ff) { + // 2 byte code point + size += 2; + } + else if(Character.isHighSurrogate(c)) { + if(pos + 1 < s.length() && Character.isLowSurrogate(s.charAt(pos + 1))) { + // High surrogate followed by low surrogate means the code point + // is between U+10000 and U+10FFFF, so it requires 4 bytes. 
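					// A hand-worked size example (not taken from the patch): a string holding 'a', U+00E9,
					// U+4E2D, and the surrogate pair for U+1F525 has packedSize 1 + 2 + 3 + 4 = 10, since
					// those code points need one, two, three, and four UTF-8 bytes respectively. An
					// embedded '\0' counts two bytes for its 0x00 0xFF escape, per the branch above.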
+ size += 4; + pos += 1; + } + else { + throw new IllegalArgumentException("malformed UTF-16 has high surrogate not followed by low surrogate"); + } + } + else if(Character.isLowSurrogate(c)) { + throw new IllegalArgumentException("malformed UTF-16 has low surrogate without prior high surrogate"); + } + else { + // 3 byte code point + size += 3; + } + pos += 1; + } + + return size; + } + private StringUtil() {} } diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java index b3761d8c5d..5fa9726c14 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java @@ -73,6 +73,7 @@ public class Tuple implements Comparable, Iterable { private List elements; private int memoizedHash = 0; private byte[] packed = null; + private int memoizedPackedSize = -1; private Tuple(List elements, Object newItem) { this(elements); @@ -83,12 +84,6 @@ public class Tuple implements Comparable, Iterable { this.elements = new ArrayList<>(elements); } - private enum VersionstampExpectations { - UNKNOWN, - HAS_INCOMPLETE, - HAS_NO_INCOMPLETE - } - /** * Creates a copy of this {@code Tuple} with an appended last element. The parameter * is untyped but only {@link String}, {@code byte[]}, {@link Number}s, {@link UUID}s, @@ -313,13 +308,15 @@ public class Tuple implements Comparable, Iterable { byte[] packInternal(byte[] prefix, boolean copy) { boolean hasPrefix = prefix != null && prefix.length > 1; if(packed == null) { - byte[] result = TupleUtil.pack(elements, prefix); + byte[] result = TupleUtil.pack(elements, prefix, getPackedSize()); if(hasPrefix) { packed = Arrays.copyOfRange(result, prefix.length, result.length); + memoizedPackedSize = packed.length; return result; } else { packed = result; + memoizedPackedSize = packed.length; } } if(hasPrefix) { @@ -366,21 +363,23 @@ public class Tuple implements Comparable, Iterable { * @throws IllegalArgumentException if there is not exactly one incomplete {@link Versionstamp} included in this {@code Tuple} */ public byte[] packWithVersionstamp(byte[] prefix) { - return TupleUtil.packWithVersionstamp(elements, prefix); + return TupleUtil.packWithVersionstamp(elements, prefix, getPackedSize()); } byte[] packWithVersionstampInternal(byte[] prefix, boolean copy) { boolean hasPrefix = prefix != null && prefix.length > 0; if(packed == null) { - byte[] result = TupleUtil.packWithVersionstamp(elements, prefix); + byte[] result = TupleUtil.packWithVersionstamp(elements, prefix, getPackedSize()); if(hasPrefix) { byte[] withoutPrefix = Arrays.copyOfRange(result, prefix.length, result.length); TupleUtil.adjustVersionPosition(packed, -1 * prefix.length); packed = withoutPrefix; + memoizedPackedSize = packed.length; return result; } else { packed = result; + memoizedPackedSize = packed.length; } } if(hasPrefix) { @@ -398,13 +397,13 @@ public class Tuple implements Comparable, Iterable { } } - byte[] packMaybeVersionstamp(byte[] prefix) { + byte[] packMaybeVersionstamp() { if(packed == null) { if(hasIncompleteVersionstamp()) { - return packWithVersionstampInternal(prefix, false); + return packWithVersionstampInternal(null, false); } else { - return packInternal(prefix, false); + return packInternal(null, false); } } else { @@ -489,6 +488,7 @@ public class Tuple implements Comparable, Iterable { Tuple t = new Tuple(); t.elements = TupleUtil.unpack(bytes, offset, length); t.packed = Arrays.copyOfRange(bytes, offset, offset + 
length); + t.memoizedPackedSize = length; return t; } @@ -727,11 +727,14 @@ public class Tuple implements Comparable, Iterable { Object o = this.elements.get(index); if(o == null) { return null; - } else if(o instanceof Tuple) { + } + else if(o instanceof Tuple) { return (Tuple)o; - } else if(o instanceof List) { - return Tuple.fromItems((List)o); - } else { + } + else if(o instanceof List) { + return Tuple.fromItems((List)o); + } + else { throw new ClassCastException("Cannot convert item of type " + o.getClass() + " to tuple"); } } @@ -824,16 +827,23 @@ public class Tuple implements Comparable, Iterable { } /** - * Get the number of bytes in the packed representation of this {@code Tuple}. Note that at the - * moment, this number is calculated by packing the {@code Tuple} and looking at its size. This method - * will memoize the result, however, so asking the same {@code Tuple} for its size multiple times - * is a fast operation. + * Get the number of bytes in the packed representation of this {@code Tuple}. This is done by summing + * the serialized sizes of all of the elements of this {@code Tuple} and does not pack everything + * into a single {@code Tuple}. The return value of this function is stored within this {@code Tuple} + * after this function has been called so that subsequent calls on the same object are fast. This method + * does not validate that there is no more than one incomplete {@link Versionstamp} in this {@code Tuple}. * * @return the number of bytes in the packed representation of this {@code Tuple} */ public int getPackedSize() { - byte[] p = packMaybeVersionstamp(null); - return p.length; + if(memoizedPackedSize < 0) { + memoizedPackedSize = getPackedSize(false); + } + return memoizedPackedSize; + } + + int getPackedSize(boolean nested) { + return TupleUtil.getPackedSize(elements, nested); } /** @@ -871,7 +881,7 @@ public class Tuple implements Comparable, Iterable { @Override public int hashCode() { if(memoizedHash == 0) { - memoizedHash = Arrays.hashCode(packMaybeVersionstamp(null)); + memoizedHash = Arrays.hashCode(packMaybeVersionstamp()); } return memoizedHash; } diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java index 34d0f78653..fc1fbc7262 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java @@ -36,11 +36,10 @@ import com.apple.foundationdb.FDB; class TupleUtil { private static final byte nil = 0x00; - private static final BigInteger[] BIG_INT_SIZE_LIMITS; - private static final Charset UTF8; + private static final Charset UTF8 = Charset.forName("UTF-8"); private static final BigInteger LONG_MIN_VALUE = BigInteger.valueOf(Long.MIN_VALUE); private static final BigInteger LONG_MAX_VALUE = BigInteger.valueOf(Long.MAX_VALUE); - private static final IterableComparator iterableComparator; + private static final IterableComparator iterableComparator = new IterableComparator(); private static final byte BYTES_CODE = 0x01; private static final byte STRING_CODE = 0x02; @@ -57,26 +56,11 @@ class TupleUtil { private static final byte[] NULL_ARR = new byte[] {nil}; private static final byte[] NULL_ESCAPED_ARR = new byte[] {nil, (byte)0xFF}; - private static final byte[] BYTES_ARR = new byte[]{BYTES_CODE}; - private static final byte[] STRING_ARR = new byte[]{STRING_CODE}; - private static final byte[] NESTED_ARR = new byte[]{NESTED_CODE}; - private static final byte[] 
INT_ZERO_ARR = new byte[]{INT_ZERO_CODE}; - private static final byte[] FALSE_ARR = new byte[]{FALSE_CODE}; - private static final byte[] TRUE_ARR = new byte[]{TRUE_CODE}; - private static final byte[] VERSIONSTAMP_ARR = new byte[]{VERSIONSTAMP_CODE}; - - static { - BIG_INT_SIZE_LIMITS = new BigInteger[9]; - for(int i = 0; i < BIG_INT_SIZE_LIMITS.length; i++) { - BIG_INT_SIZE_LIMITS[i] = (BigInteger.ONE).shiftLeft(i * 8).subtract(BigInteger.ONE); - } - UTF8 = Charset.forName("UTF-8"); - iterableComparator = new IterableComparator(); - } static class DecodeState { final List values; int end; + int nullCount; // Basically a hack to allow findTerminator to return the terminator and null count DecodeState() { values = new ArrayList<>(); @@ -87,15 +71,36 @@ class TupleUtil { values.add(value); this.end = end; } + + int findNullTerminator(byte[] bytes, int from, int to) { + nullCount = 0; + int x = from; + while(x < to) { + if(bytes[x] == 0x00) { + if(x + 1 >= to || bytes[x + 1] != (byte)0xFF) { + return x; + } + else { + nullCount++; + x += 2; + } + } + else { + x += 1; + } + } + throw new IllegalArgumentException("no terminator found for bytes starting at " + from); + } } static class EncodeState { - final List encodedValues; + final ByteBuffer encodedBytes; int totalLength; int versionPos; - EncodeState(int capacity) { - this.encodedValues = new ArrayList<>(capacity); + EncodeState(ByteBuffer dest) { + encodedBytes = dest; + encodedBytes.order(ByteOrder.BIG_ENDIAN); totalLength = 0; versionPos = -1; } @@ -104,25 +109,52 @@ class TupleUtil { if(versionPos >= 0 && this.versionPos >= 0) { throw new IllegalArgumentException("Multiple incomplete Versionstamps included in Tuple"); } - encodedValues.add(encoded); + encodedBytes.put(encoded); totalLength += encoded.length; this.versionPos = versionPos; return this; } EncodeState add(byte[] encoded) { - encodedValues.add(encoded); + encodedBytes.put(encoded); totalLength += encoded.length; return this; } - } - static int byteLength(byte[] bytes) { - for(int i = 0; i < bytes.length; i++) { - if(bytes[i] == 0x00) continue; - return bytes.length - i; + EncodeState add(byte[] encoded, int offset, int length) { + encodedBytes.put(encoded, offset, length); + totalLength += length; + return this; + } + + EncodeState addNullEscaped(byte[] encoded) { + int nullCount = ByteArrayUtil.nullCount(encoded); + if(nullCount == 0) { + encodedBytes.put(encoded); + } + else { + ByteArrayUtil.replace(encoded, 0, encoded.length, NULL_ARR, NULL_ESCAPED_ARR, encodedBytes); + } + return this; + } + + EncodeState add(byte b) { + encodedBytes.put(b); + totalLength++; + return this; + } + + EncodeState add(int i) { + encodedBytes.putInt(i); + totalLength += Integer.BYTES; + return this; + } + + EncodeState add(long l) { + encodedBytes.putLong(l); + totalLength += Long.BYTES; + return this; } - return 0; } // These four functions are for adjusting the encoding of floating point numbers so @@ -153,11 +185,16 @@ class TupleUtil { return Double.longBitsToDouble(origBits); } - // Get the number of bytes in the representation of a long. - static int byteCount(long i) { + // Get the minimal number of bytes in the representation of a long. + static int minimalByteCount(long i) { return (Long.SIZE + 7 - Long.numberOfLeadingZeros(i >= 0 ? i : -i)) / 8; } + static int minimalByteCount(BigInteger i) { + int bitLength = (i.compareTo(BigInteger.ZERO) >= 0) ? 
i.bitLength() : i.negate().bitLength(); + return (bitLength + 7) / 8; + } + private static void adjustVersionPosition300(byte[] packed, int delta) { int offsetOffset = packed.length - Short.BYTES; ByteBuffer buffer = ByteBuffer.wrap(packed, offsetOffset, Short.BYTES).order(ByteOrder.LITTLE_ENDIAN); @@ -224,7 +261,7 @@ class TupleUtil { state.add(NULL_ESCAPED_ARR); } else { - state.add(NULL_ARR); + state.add(nil); } } else if(t instanceof byte[]) @@ -258,133 +295,104 @@ class TupleUtil { } static void encode(EncodeState state, byte[] bytes) { - byte[] escaped = ByteArrayUtil.replace(bytes, NULL_ARR, NULL_ESCAPED_ARR); - state.add(BYTES_ARR).add(escaped).add(NULL_ARR); + state.add(BYTES_CODE).addNullEscaped(bytes).add(nil); } static void encode(EncodeState state, String s) { - byte[] escaped = ByteArrayUtil.replace(s.getBytes(UTF8), NULL_ARR, NULL_ESCAPED_ARR); - state.add(STRING_ARR).add(escaped).add(NULL_ARR); + byte[] bytes = s.getBytes(UTF8); + state.add(STRING_CODE).addNullEscaped(bytes).add(nil); } static void encode(EncodeState state, BigInteger i) { //System.out.println("Encoding integral " + i); if(i.equals(BigInteger.ZERO)) { - state.add(INT_ZERO_ARR); + state.add(INT_ZERO_CODE); return; } - byte[] bytes = i.toByteArray(); + int n = minimalByteCount(i); + if(n > 0xff) { + throw new IllegalArgumentException("BigInteger magnitude is too large (more than 255 bytes)"); + } if(i.compareTo(BigInteger.ZERO) > 0) { - if(i.compareTo(BIG_INT_SIZE_LIMITS[BIG_INT_SIZE_LIMITS.length-1]) > 0) { - int length = byteLength(bytes); - if(length > 0xff) { - throw new IllegalArgumentException("BigInteger magnitude is too large (more than 255 bytes)"); - } - byte[] intBytes = new byte[length + 2]; - intBytes[0] = POS_INT_END; - intBytes[1] = (byte)(length); - System.arraycopy(bytes, bytes.length - length, intBytes, 2, length); - state.add(intBytes); + byte[] bytes = i.toByteArray(); + if(n > Long.BYTES) { + state.add(POS_INT_END); + state.add((byte)n); + state.add(bytes, bytes.length - n, n); } else { - int n = ByteArrayUtil.bisectLeft(BIG_INT_SIZE_LIMITS, i); - assert n <= BIG_INT_SIZE_LIMITS.length; //System.out.println(" -- integral has 'n' of " + n + " and output bytes of " + bytes.length); - byte[] intBytes = new byte[n + 1]; - intBytes[0] = (byte) (INT_ZERO_CODE + n); - System.arraycopy(bytes, bytes.length - n, intBytes, 1, n); - state.add(intBytes); + state.add((byte)(INT_ZERO_CODE + n)); + state.add(bytes, bytes.length - n, n); } } else { - if(i.negate().compareTo(BIG_INT_SIZE_LIMITS[BIG_INT_SIZE_LIMITS.length - 1]) > 0) { - int length = byteLength(i.negate().toByteArray()); - if (length > 0xff) { - throw new IllegalArgumentException("BigInteger magnitude is too large (more than 255 bytes)"); + byte[] bytes = i.subtract(BigInteger.ONE).toByteArray(); + if(n > Long.BYTES) { + state.add(NEG_INT_START); + state.add((byte)(n ^ 0xff)); + if(bytes.length >= n) { + state.add(bytes, bytes.length - n, n); } - BigInteger offset = BigInteger.ONE.shiftLeft(length * 8).subtract(BigInteger.ONE); - byte[] adjusted = i.add(offset).toByteArray(); - byte[] intBytes = new byte[length + 2]; - intBytes[0] = NEG_INT_START; - intBytes[1] = (byte) (length ^ 0xff); - if (adjusted.length >= length) { - System.arraycopy(adjusted, adjusted.length - length, intBytes, 2, length); - } else { - Arrays.fill(intBytes, 2, intBytes.length - adjusted.length, (byte) 0x00); - System.arraycopy(adjusted, 0, intBytes, intBytes.length - adjusted.length, adjusted.length); + else { + for(int x = 0; x < n - bytes.length; x++) { + 
state.add((byte)0x00); + } + state.add(bytes, 0, bytes.length); } - state.add(intBytes); } else { - int n = ByteArrayUtil.bisectLeft(BIG_INT_SIZE_LIMITS, i.negate()); - - assert n >= 0 && n < BIG_INT_SIZE_LIMITS.length; // can we do this? it seems to be required for the following statement - - long maxv = BIG_INT_SIZE_LIMITS[n].add(i).longValue(); - byte[] adjustedBytes = ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putLong(maxv).array(); - byte[] intBytes = new byte[n + 1]; - intBytes[0] = (byte) (INT_ZERO_CODE - n); - System.arraycopy(adjustedBytes, adjustedBytes.length - n, intBytes, 1, n); - state.add(intBytes); + state.add((byte)(INT_ZERO_CODE - n)); + if(bytes.length >= n) { + state.add(bytes, bytes.length - n, n); + } + else { + for(int x = 0; x < n - bytes.length; x++) { + state.add((byte)0x00); + } + state.add(bytes, 0, bytes.length); + } } } } static void encode(EncodeState state, long i) { if(i == 0L) { - state.add(INT_ZERO_ARR); + state.add(INT_ZERO_CODE); return; } - int n = byteCount(i); - byte[] intBytes = new byte[n + 1]; + int n = minimalByteCount(i); // First byte encodes number of bytes (as difference from INT_ZERO_CODE) - intBytes[0] = (byte)(INT_ZERO_CODE + (i >= 0 ? n : -n)); + state.add((byte)(INT_ZERO_CODE + (i >= 0 ? n : -n))); // For positive integers, copy the bytes in big-endian order excluding leading 0x00 bytes. // For negative integers, copy the bytes of the one's complement representation excluding // the leading 0xff bytes. As Java stores negative values in two's complement, we subtract 1 // from negative values. long val = Long.reverseBytes((i >= 0) ? i : (i - 1)) >> (Long.SIZE - 8 * n); - for(int x = 1; x < intBytes.length; x++) { - intBytes[x] = (byte)(val & 0xff); + for(int x = 0; x < n; x++) { + state.add((byte)(val & 0xff)); val >>= 8; } - state.add(intBytes); } static void encode(EncodeState state, Float f) { - byte[] floatBytes = ByteBuffer.allocate(1 + Float.BYTES).order(ByteOrder.BIG_ENDIAN) - .put(FLOAT_CODE) - .putInt(encodeFloatBits(f)) - .array(); - state.add(floatBytes); + state.add(FLOAT_CODE).add(encodeFloatBits(f)); } static void encode(EncodeState state, Double d) { - byte[] doubleBytes = ByteBuffer.allocate(1 + Double.BYTES).order(ByteOrder.BIG_ENDIAN) - .put(DOUBLE_CODE) - .putLong(encodeDoubleBits(d)) - .array(); - state.add(doubleBytes); + state.add(DOUBLE_CODE).add(encodeDoubleBits(d)); } static void encode(EncodeState state, Boolean b) { - if(b) { - state.add(TRUE_ARR); - } - else { - state.add(FALSE_ARR); - } + state.add(b ? 
TRUE_CODE : FALSE_CODE); } static void encode(EncodeState state, UUID uuid) { - byte[] uuidBytes = ByteBuffer.allocate(17).put(UUID_CODE).order(ByteOrder.BIG_ENDIAN) - .putLong(uuid.getMostSignificantBits()).putLong(uuid.getLeastSignificantBits()) - .array(); - state.add(uuidBytes); + state.add(UUID_CODE).add(uuid.getMostSignificantBits()).add(uuid.getLeastSignificantBits()); } static void encode(EncodeState state, Versionstamp v) { - state.add(VERSIONSTAMP_ARR); + state.add(VERSIONSTAMP_CODE); if(v.isComplete()) { state.add(v.getBytes()); } @@ -394,11 +402,11 @@ class TupleUtil { } static void encode(EncodeState state, List value) { - state.add(NESTED_ARR); + state.add(NESTED_CODE); for(Object t : value) { encode(state, t, true); } - state.add(NULL_ARR); + state.add(nil); } static void decode(DecodeState state, byte[] rep, int pos, int last) { @@ -411,17 +419,32 @@ class TupleUtil { state.add(null, start); } else if(code == BYTES_CODE) { - int end = ByteArrayUtil.findTerminator(rep, (byte)0x0, (byte)0xff, start, last); + int end = state.findNullTerminator(rep, start, last); //System.out.println("End of byte string: " + end); - byte[] range = ByteArrayUtil.replace(rep, start, end - start, NULL_ESCAPED_ARR, new byte[] { nil }); + byte[] range; + if(state.nullCount == 0) { + range = Arrays.copyOfRange(rep, start, end); + } + else { + ByteBuffer dest = ByteBuffer.allocate(end - start - state.nullCount); + ByteArrayUtil.replace(rep, start, end - start, NULL_ESCAPED_ARR, NULL_ARR, dest); + range = dest.array(); + } //System.out.println(" -> byte string contents: '" + ArrayUtils.printable(range) + "'"); state.add(range, end + 1); } else if(code == STRING_CODE) { - int end = ByteArrayUtil.findTerminator(rep, (byte)0x0, (byte)0xff, start, last); + int end = state.findNullTerminator(rep, start, last); //System.out.println("End of UTF8 string: " + end); - byte[] stringBytes = ByteArrayUtil.replace(rep, start, end - start, NULL_ESCAPED_ARR, new byte[] { nil }); - String str = new String(stringBytes, UTF8); + String str; + if(state.nullCount == 0) { + str = new String(rep, start, end - start, UTF8); + } + else { + ByteBuffer dest = ByteBuffer.allocate(end - start - state.nullCount); + ByteArrayUtil.replace(rep, start, end - start, NULL_ESCAPED_ARR, NULL_ARR, dest); + str = new String(dest.array(), UTF8); + } //System.out.println(" -> UTF8 string contents: '" + str + "'"); state.add(str, end + 1); } @@ -442,19 +465,23 @@ class TupleUtil { state.add(true, start); } else if(code == UUID_CODE) { - ByteBuffer bb = ByteBuffer.wrap(rep, start, 16).order(ByteOrder.BIG_ENDIAN); + ByteBuffer bb = ByteBuffer.wrap(rep, start, 2 * Long.BYTES).order(ByteOrder.BIG_ENDIAN); long msb = bb.getLong(); long lsb = bb.getLong(); state.add(new UUID(msb, lsb), start + 16); } else if(code == POS_INT_END) { int n = rep[start] & 0xff; - BigInteger res = new BigInteger(ByteArrayUtil.join(new byte[]{0x00}, Arrays.copyOfRange(rep, start+1, start+n+1))); + byte[] intBytes = new byte[n + 1]; + System.arraycopy(rep, start + 1, intBytes, 1, n); + BigInteger res = new BigInteger(intBytes); state.add(res, start + n + 1); } else if(code == NEG_INT_START) { int n = (rep[start] ^ 0xff) & 0xff; - BigInteger origValue = new BigInteger(ByteArrayUtil.join(new byte[]{0x00}, Arrays.copyOfRange(rep, start+1, start+n+1))); + byte[] intBytes = new byte[n + 1]; + System.arraycopy(rep, start + 1, intBytes, 1, n); + BigInteger origValue = new BigInteger(intBytes); BigInteger offset = BigInteger.ONE.shiftLeft(n*8).subtract(BigInteger.ONE); 
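			// Explanatory note on this decode branch: the encoder stores an n-byte negative
			// integer as value + (2^(8n) - 1), shifting it into the non-negative range so that
			// unsigned byte order matches numeric order. Subtracting the same offset here
			// recovers the original value; e.g. -1 with n = 1 is stored as 0xFE, and
			// 0xFE - (2^8 - 1) = -1.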
state.add(origValue.subtract(offset), start + n + 1); } @@ -464,7 +491,7 @@ class TupleUtil { int n = positive ? code - INT_ZERO_CODE : INT_ZERO_CODE - code; int end = start + n; - if(rep.length < end) { + if(rep.length < last) { throw new RuntimeException("Invalid tuple (possible truncation)"); } @@ -509,9 +536,9 @@ class TupleUtil { else if(code == NESTED_CODE) { DecodeState subResult = new DecodeState(); int endPos = start; - while(endPos < rep.length) { + while(endPos < last) { if(rep[endPos] == nil) { - if(endPos + 1 < rep.length && rep[endPos+1] == (byte)0xff) { + if(endPos + 1 < last && rep[endPos+1] == (byte)0xff) { subResult.add(null, endPos + 2); endPos += 2; } else { @@ -631,19 +658,27 @@ class TupleUtil { //System.out.println("Joining whole tuple..."); } - static byte[] pack(List items, byte[] prefix) { - EncodeState state = new EncodeState(2 * items.size() + (prefix == null ? 0 : 1)); + static byte[] pack(List items, byte[] prefix, int expectedSize) { + ByteBuffer dest = ByteBuffer.allocate(expectedSize + (prefix != null ? prefix.length : 0)); + EncodeState state = new EncodeState(dest); + if(prefix != null) { + state.add(prefix); + } encodeAll(state, items, prefix); if(state.versionPos >= 0) { throw new IllegalArgumentException("Incomplete Versionstamp included in vanilla tuple packInternal"); } else { - return ByteArrayUtil.join(null, state.encodedValues); + return dest.array(); } } - static byte[] packWithVersionstamp(List items, byte[] prefix) { - EncodeState state = new EncodeState(2 * items.size() + (prefix == null ? 1 : 2)); + static byte[] packWithVersionstamp(List items, byte[] prefix, int expectedSize) { + ByteBuffer dest = ByteBuffer.allocate(expectedSize + (prefix != null ? prefix.length : 0)); + EncodeState state = new EncodeState(dest); + if(prefix != null) { + state.add(prefix); + } encodeAll(state, items, prefix); if(state.versionPos < 0) { throw new IllegalArgumentException("No incomplete Versionstamp included in tuple packInternal with versionstamp"); @@ -652,15 +687,73 @@ class TupleUtil { if(state.versionPos > 0xffff) { throw new IllegalArgumentException("Tuple has incomplete version at position " + state.versionPos + " which is greater than the maximum " + 0xffff); } + dest.order(ByteOrder.LITTLE_ENDIAN); if (FDB.instance().getAPIVersion() < 520) { - state.add(ByteBuffer.allocate(Short.BYTES).order(ByteOrder.LITTLE_ENDIAN).putShort((short)state.versionPos).array()); + dest.putShort((short)state.versionPos); } else { - state.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(state.versionPos).array()); + dest.putInt(state.versionPos); } - return ByteArrayUtil.join(null, state.encodedValues); + return dest.array(); } } + static int getPackedSize(List items, boolean nested) { + int packedSize = 0; + for(Object item : items) { + if(item == null) + packedSize += nested ? 2 : 1; + else if(item instanceof byte[]) { + byte[] bytes = (byte[])item; + packedSize += 2 + bytes.length + ByteArrayUtil.nullCount((byte[])item); + } + else if(item instanceof String) { + try { + int strPackedSize = StringUtil.packedSize((String)item); + packedSize += 2 + strPackedSize; + } + catch (IllegalArgumentException e) { + // The unicode was malformed. 
Grab the array and count the bytes + byte[] strBytes = ((String)item).getBytes(UTF8); + packedSize += 2 + strBytes.length + ByteArrayUtil.nullCount(strBytes); + } + } + else if(item instanceof Float) + packedSize += 1 + Float.BYTES; + else if(item instanceof Double) + packedSize += 1 + Double.BYTES; + else if(item instanceof Boolean) + packedSize += 1; + else if(item instanceof UUID) + packedSize += 1 + 2 * Long.BYTES; + else if(item instanceof BigInteger) { + BigInteger bigInt = (BigInteger)item; + int byteCount = minimalByteCount(bigInt); + // If byteCount <= 8, then the encoding uses 1 byte for both the size + // and type code. If byteCount > 8, then there is 1 byte for the type code + // and 1 byte for the length. In both cases, the value is followed by + // the byte count. + packedSize += byteCount + ((byteCount <= 8) ? 1 : 2); + } + else if(item instanceof Number) + packedSize += 1 + minimalByteCount(((Number)item).longValue()); + else if(item instanceof Versionstamp) { + packedSize += 1 + Versionstamp.LENGTH; + Versionstamp versionstamp = (Versionstamp)item; + if(!versionstamp.isComplete()) { + int suffixSize = FDB.instance().getAPIVersion() < 520 ? Short.BYTES : Integer.BYTES; + packedSize += suffixSize; + } + } + else if(item instanceof List) + packedSize += 2 + getPackedSize((List)item, true); + else if(item instanceof Tuple) + packedSize += 2 + ((Tuple)item).getPackedSize(true); + else + throw new IllegalArgumentException("unknown type " + item.getClass() + " for tuple packing"); + } + return packedSize; + } + static boolean hasIncompleteVersionstamp(Stream items) { return items.anyMatch(item -> { if(item == null) { @@ -683,10 +776,10 @@ class TupleUtil { public static void main(String[] args) { try { - byte[] bytes = pack(Collections.singletonList(4), null); + byte[] bytes = pack(Collections.singletonList(4), null, 2); DecodeState result = new DecodeState(); decode(result, bytes, 0, bytes.length); - int val = (int)result.values.get(0); + int val = ((Number)result.values.get(0)).intValue(); assert 4 == val; } catch(Exception e) { @@ -695,7 +788,7 @@ class TupleUtil { } try { - byte[] bytes = pack(Collections.singletonList("\u021Aest \u0218tring"), null); + byte[] bytes = pack(Collections.singletonList("\u021Aest \u0218tring"), null, 15); DecodeState result = new DecodeState(); decode(result, bytes, 0, bytes.length); String string = (String)result.values.get(0); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index 617586fe9d..f9d7d12c3a 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -412,7 +412,11 @@ public class AsyncStackTester { return inst.popParams(listSize).thenAcceptAsync(rawElements -> { List tuples = new ArrayList<>(listSize); for(Object o : rawElements) { - tuples.add(Tuple.fromBytes((byte[])o)); + // Unpacking a tuple keeps around the serialized representation and uses + // it for comparison if it's available. To test semantic comparison, recreate + // the tuple from the item list. 
+ Tuple t = Tuple.fromBytes((byte[])o); + tuples.add(Tuple.fromList(t.getItems())); } Collections.sort(tuples); for(Tuple t : tuples) { diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java index 96281dec72..06f9b435d5 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java @@ -368,9 +368,13 @@ public class StackTester { else if (op == StackOperation.TUPLE_SORT) { int listSize = StackUtils.getInt(inst.popParam().join()); List rawElements = inst.popParams(listSize).join(); - List tuples = new ArrayList(listSize); + List tuples = new ArrayList<>(listSize); for(Object o : rawElements) { - tuples.add(Tuple.fromBytes((byte[])o)); + // Unpacking a tuple keeps around the serialized representation and uses + // it for comparison if it's available. To test semantic comparison, recreate + // the tuple from the item list. + Tuple t = Tuple.fromBytes((byte[])o); + tuples.add(Tuple.fromList(t.getItems())); } Collections.sort(tuples); for(Tuple t : tuples) { diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java index 3de9b76785..54448e3ac9 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TuplePerformanceTest.java @@ -16,7 +16,8 @@ public class TuplePerformanceTest { private enum GeneratedTypes { ALL, LONG, - FLOATING_POINT + FLOATING_POINT, + STRING_LIKE } private final Random r; @@ -77,7 +78,7 @@ public class TuplePerformanceTest { values.add(nested); } } - return Tuple.fromItems(values); + return Tuple.fromList(values); } public Tuple createLongsTuple(int length) { @@ -91,7 +92,7 @@ public class TuplePerformanceTest { } values.add(val); } - return Tuple.fromItems(values); + return Tuple.fromList(values); } public Tuple createFloatingPointTuple(int length) { @@ -112,7 +113,41 @@ public class TuplePerformanceTest { values.add(Double.longBitsToDouble(r.nextLong())); } } - return Tuple.fromItems(values); + return Tuple.fromList(values); + } + + public Tuple createStringLikeTuple(int length) { + List values = new ArrayList<>(length); + for(int i = 0; i < length; i++) { + double choice = r.nextDouble(); + if(choice < 0.4) { + byte[] arr = new byte[r.nextInt(20)]; + r.nextBytes(arr); + values.add(arr); + } + else if(choice < 0.8) { + // Random ASCII codepoints + int[] codepoints = new int[r.nextInt(20)]; + for(int x = 0; x < codepoints.length; x++) { + codepoints[x] = r.nextInt(0x7F); + } + values.add(new String(codepoints, 0, codepoints.length)); + } + else if(choice < 0.9) { + // All zeroes + byte[] zeroes = new byte[r.nextInt(20)]; + values.add(zeroes); + } + else { + // Random Unicode codepoints + int[] codepoints = new int[r.nextInt(20)]; + for(int x = 0; x < codepoints.length; x++) { + codepoints[x] = r.nextInt(0x10FFFF); + } + values.add(new String(codepoints, 0, codepoints.length)); + } + } + return Tuple.fromList(values); } public Tuple createTuple(int length) { @@ -123,6 +158,8 @@ public class TuplePerformanceTest { return createLongsTuple(length); case FLOATING_POINT: return createFloatingPointTuple(length); + case STRING_LIKE: + return createStringLikeTuple(length); default: throw new IllegalStateException("unknown generated types " + generatedTypes); } @@ -143,6 +180,7 @@ public class 
TuplePerformanceTest { long unpackNanos = 0L; long equalsNanos = 0L; long equalsArrayNanos = 0L; + long sizeNanos = 0L; long hashNanos = 0L; long secondHashNanos = 0L; long subspacePackNanos = 0L; @@ -150,6 +188,9 @@ public class TuplePerformanceTest { long totalLength = 0L; long totalBytes = 0L; for(int i = 0; i < iterations; i++) { + if(i % 100_000 == 0) { + System.out.println(" iteration " + i); + } int length = r.nextInt(20); Tuple t = createTuple(length); @@ -157,8 +198,8 @@ public class TuplePerformanceTest { byte[] serialized = t.pack(); long endNanos = System.nanoTime(); packNanos += endNanos - startNanos; - totalLength += length; - totalBytes += serialized.length; + totalLength += t.size(); + totalBytes += t.getPackedSize(); startNanos = System.nanoTime(); Tuple t2 = Tuple.fromBytes(serialized); @@ -182,6 +223,15 @@ public class TuplePerformanceTest { endNanos = System.nanoTime(); equalsArrayNanos += endNanos - startNanos; + tCopy = Tuple.fromList(t.getItems()); + startNanos = System.nanoTime(); + int size = tCopy.getPackedSize(); + endNanos = System.nanoTime(); + if (size != t.pack().length) { + throw new RuntimeException("packed size did not match actual packed length: " + t + " -- " + " " + tCopy.getPackedSize() + " instead of " + t.getPackedSize()); + } + sizeNanos += endNanos - startNanos; + startNanos = System.nanoTime(); byte[] subspacePacked = subspace.pack(t); endNanos = System.nanoTime(); @@ -229,6 +279,8 @@ public class TuplePerformanceTest { System.out.printf(" Equals time per tuple: %f \u03BCs%n", equalsNanos * 1e-3 / iterations); System.out.printf(" Equals time (using packed): %f s%n", equalsArrayNanos * 1e-9); System.out.printf(" Equals time (using packed) per tuple: %f \u03BCs%n", equalsArrayNanos * 1e-3 / iterations); + System.out.printf(" Size time: %f s%n", sizeNanos * 1e-9); + System.out.printf(" Size time per tuple: %f \u03BCs%n", sizeNanos * 1e-3 / iterations); System.out.printf(" Subspace pack time: %f s%n", subspacePackNanos * 1e-9); System.out.printf(" Subspace pack time per tuple: %f \u03BCs%n", subspacePackNanos * 1e-3 / iterations); System.out.printf(" Subspace unpack time: %f s%n", subspaceUnpackNanos * 1e-9); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java index 305c1a90f0..2f0fd1c2c4 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java @@ -20,10 +20,6 @@ package com.apple.foundationdb.test; -import com.apple.foundationdb.TransactionContext; -import com.apple.foundationdb.tuple.ByteArrayUtil; -import com.apple.foundationdb.tuple.Tuple; - import java.math.BigInteger; import java.util.ArrayList; import java.util.Arrays; @@ -31,6 +27,11 @@ import java.util.List; import java.util.Objects; import java.util.UUID; +import com.apple.foundationdb.TransactionContext; +import com.apple.foundationdb.tuple.ByteArrayUtil; +import com.apple.foundationdb.tuple.Tuple; +import com.apple.foundationdb.tuple.Versionstamp; + public class TupleTest { private static final byte FF = (byte)0xff; @@ -40,6 +41,7 @@ public class TupleTest { // FDB fdb = FDB.selectAPIVersion(610); serializedForms(); comparisons(); + replaceTests(); /* try(Database db = fdb.open()) { runTests(reps, db); @@ -70,6 +72,7 @@ public class TupleTest { private static void serializedForms() { List serializations = new ArrayList<>(); TupleSerialization.addAll(serializations, + Tuple.from(), new byte[0], 
Tuple.from(0L), new byte[]{0x14}, Tuple.from(BigInteger.ZERO), new byte[]{0x14}, Tuple.from(1L), new byte[]{0x15, 0x01}, @@ -116,6 +119,9 @@ public class TupleTest { Tuple.from(Double.longBitsToDouble(Long.MAX_VALUE)), new byte[]{0x21, FF, FF, FF, FF, FF, FF, FF, FF}, Tuple.from(Float.intBitsToFloat(~0)), new byte[]{0x20, 0x00, 0x00, 0x00, 0x00}, Tuple.from(Double.longBitsToDouble(~0L)), new byte[]{0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + Tuple.from((Object)new byte[0]), new byte[]{0x01, 0x00}, + Tuple.from((Object)new byte[]{0x01, 0x02, 0x03}), new byte[]{0x01, 0x01, 0x02, 0x03, 0x00}, + Tuple.from((Object)new byte[]{0x00, 0x00, 0x00, 0x04}), new byte[]{0x01, 0x00, FF, 0x00, FF, 0x00, FF, 0x04, 0x00}, Tuple.from(""), new byte[]{0x02, 0x00}, Tuple.from("hello"), new byte[]{0x02, 'h', 'e', 'l', 'l', 'o', 0x00}, Tuple.from("\u4e2d\u6587"), new byte[]{0x02, (byte)0xe4, (byte)0xb8, (byte)0xad, (byte)0xe6, (byte)0x96, (byte)0x87, 0x00}, @@ -123,17 +129,42 @@ public class TupleTest { Tuple.from(new String(new int[]{0x1f525}, 0, 1)), new byte[]{0x02, (byte)0xf0, (byte)0x9f, (byte)0x94, (byte)0xa5, 0x00}, Tuple.from("\ud83d\udd25"), new byte[]{0x02, (byte)0xf0, (byte)0x9f, (byte)0x94, (byte)0xa5, 0x00}, Tuple.from("\ud83e\udd6f"), new byte[]{0x02, (byte)0xf0, (byte)0x9f, (byte)0xa5, (byte)0xaf, 0x00}, + Tuple.from("\ud83d"), new byte[]{0x02, 0x3f, 0x00}, Tuple.from("\udd25\ud83e\udd6f"), new byte[]{0x02, 0x3f, (byte)0xf0, (byte)0x9f, (byte)0xa5, (byte)0xaf, 0x00}, // malformed string - low surrogate without high surrogate - Tuple.from("a\udd25\ud83e\udd6f"), new byte[]{0x02, 'a', 0x3f, (byte)0xf0, (byte)0x9f, (byte)0xa5, (byte)0xaf, 0x00} // malformed string - low surrogate without high surrogate + Tuple.from("a\udd25\ud83e\udd6f"), new byte[]{0x02, 'a', 0x3f, (byte)0xf0, (byte)0x9f, (byte)0xa5, (byte)0xaf, 0x00}, // malformed string - low surrogate without high surrogate + Tuple.from(Tuple.from((Object)null)), new byte[]{0x05, 0x00, FF, 0x00}, + Tuple.from(Tuple.from(null, "hello")), new byte[]{0x05, 0x00, FF, 0x02, 'h', 'e', 'l', 'l', 'o', 0x00, 0x00}, + Tuple.from(Arrays.asList(null, "hello")), new byte[]{0x05, 0x00, FF, 0x02, 'h', 'e', 'l', 'l', 'o', 0x00, 0x00}, + Tuple.from(Tuple.from(null, "hell\0")), new byte[]{0x05, 0x00, FF, 0x02, 'h', 'e', 'l', 'l', 0x00, FF, 0x00, 0x00}, + Tuple.from(Arrays.asList(null, "hell\0")), new byte[]{0x05, 0x00, FF, 0x02, 'h', 'e', 'l', 'l', 0x00, FF, 0x00, 0x00}, + Tuple.from(Tuple.from((Object)null), "hello"), new byte[]{0x05, 0x00, FF, 0x00, 0x02, 'h', 'e', 'l', 'l', 'o', 0x00}, + Tuple.from(Tuple.from((Object)null), "hello", new byte[]{0x01, 0x00}, new byte[0]), new byte[]{0x05, 0x00, FF, 0x00, 0x02, 'h', 'e', 'l', 'l', 'o', 0x00, 0x01, 0x01, 0x00, FF, 0x00, 0x01, 0x00}, + Tuple.from(new UUID(0xba5eba11, 0x5ca1ab1e)), new byte[]{0x30, FF, FF, FF, FF, (byte)0xba, 0x5e, (byte)0xba, 0x11, 0x00, 0x00, 0x00, 0x00, 0x5c, (byte)0xa1, (byte)0xab, 0x1e}, + Tuple.from(false), new byte[]{0x26}, + Tuple.from(true), new byte[]{0x27}, + Tuple.from((short)0x3019), new byte[]{0x16, 0x30, 0x19}, + Tuple.from((byte)0x03), new byte[]{0x15, 0x03}, + Tuple.from(Versionstamp.complete(new byte[]{(byte)0xaa, (byte)0xbb, (byte)0xcc, (byte)0xdd, (byte)0xee, FF, 0x00, 0x01, 0x02, 0x03})), new byte[]{0x33, (byte)0xaa, (byte)0xbb, (byte)0xcc, (byte)0xdd, (byte)0xee, FF, 0x00, 0x01, 0x02, 0x03, 0x00, 0x00}, + Tuple.from(Versionstamp.complete(new byte[]{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a}, 657)), new byte[]{0x33, 0x01, 0x02, 0x03, 0x04, 0x05, 
0x06, 0x07, 0x08, 0x09, 0x0a, 0x02, (byte)0x91} ); + Tuple bigTuple = new Tuple(); + List serializedForms = new ArrayList<>(); + for(TupleSerialization serialization : serializations) { + bigTuple = bigTuple.addAll(serialization.tuple); + serializedForms.add(serialization.serialization); + } + serializations.add(new TupleSerialization(bigTuple, ByteArrayUtil.join(null, serializedForms))); for(TupleSerialization serialization : serializations) { System.out.println("Packing " + serialization.tuple + " (expecting: " + ByteArrayUtil.printable(serialization.serialization) + ")"); + if(serialization.tuple.getPackedSize() != serialization.serialization.length) { + throw new RuntimeException("Tuple " + serialization.tuple + " packed size " + serialization.tuple.getPackedSize() + " does not match expected packed size " + serialization.serialization.length); + } if(!Arrays.equals(serialization.tuple.pack(), serialization.serialization)) { throw new RuntimeException("Tuple " + serialization.tuple + " has serialization " + ByteArrayUtil.printable(serialization.tuple.pack()) + " which does not match expected serialization " + ByteArrayUtil.printable(serialization.serialization)); } - if(!Objects.equals(serialization.tuple, Tuple.fromBytes(serialization.serialization))) { + if(!Objects.equals(serialization.tuple, Tuple.fromItems(Tuple.fromBytes(serialization.serialization).getItems()))) { throw new RuntimeException("Tuple " + serialization.tuple + " does not match deserialization " + Tuple.fromBytes(serialization.serialization) + " which comes from serialization " + ByteArrayUtil.printable(serialization.serialization)); } @@ -176,6 +207,16 @@ public class TupleTest { Tuple.from((Object)new byte[]{0x00, FF}), Tuple.from((Object)new byte[]{0x7f}), Tuple.from((Object)new byte[]{(byte)0x80}), + Tuple.from(null, new byte[0]), + Tuple.from(null, new byte[]{0x00}), + Tuple.from(null, new byte[]{0x00, FF}), + Tuple.from(null, new byte[]{0x7f}), + Tuple.from(null, new byte[]{(byte)0x80}), + Tuple.from(Tuple.from(null, new byte[0])), + Tuple.from(Tuple.from(null, new byte[]{0x00})), + Tuple.from(Tuple.from(null, new byte[]{0x00, FF})), + Tuple.from(Tuple.from(null, new byte[]{0x7f})), + Tuple.from(Tuple.from(null, new byte[]{(byte)0x80})), Tuple.from("a"), Tuple.from("\u03bc\u03ac\u03b8\u03b7\u03bc\u03b1"), Tuple.from("\u03bc\u03b1\u0301\u03b8\u03b7\u03bc\u03b1"), @@ -195,7 +236,18 @@ public class TupleTest { Tuple.from(new UUID(-1, 0)), Tuple.from(new UUID(-1, -1)), Tuple.from(new UUID(1, -1)), - Tuple.from(new UUID(1, 1)) + Tuple.from(new UUID(1, 1)), + Tuple.from(false), + Tuple.from(true), + Tuple.from(Arrays.asList(0, 1, 2)), + Tuple.from(Arrays.asList(0, 1), "hello"), + Tuple.from(Arrays.asList(0, 1), "help"), + Tuple.from(Versionstamp.complete(new byte[]{0x0a, (byte)0xbb, (byte)0xcc, (byte)0xdd, (byte)0xee, FF, 0x00, 0x01, 0x02, 0x03})), + Tuple.from(Versionstamp.complete(new byte[]{(byte)0xaa, (byte)0xbb, (byte)0xcc, (byte)0xdd, (byte)0xee, FF, 0x00, 0x01, 0x02, 0x03})), + Tuple.from(Versionstamp.complete(new byte[]{(byte)0xaa, (byte)0xbb, (byte)0xcc, (byte)0xdd, (byte)0xee, FF, 0x00, 0x01, 0x02, 0x03}, 1)), + Tuple.from(Versionstamp.complete(new byte[]{(byte)0xaa, (byte)0xbb, (byte)0xcc, (byte)0xdd, (byte)0xee, FF, 0x00, 0x01, 0x02, 0x03}, 0xa101)), + Tuple.from(Versionstamp.complete(new byte[]{(byte)0xaa, (byte)0xbb, (byte)0xcc, (byte)0xdd, (byte)0xee, FF, 0x00, 0x01, 0x02, 0x03}, 65535)) + ); for(Tuple t1 : tuples) { @@ -209,6 +261,47 @@ public class TupleTest { if(Integer.signum(semanticComparison) != 
Integer.signum(byteComparison)) { throw new RuntimeException("Tuple t1 and t2 comparison mismatched: semantic = " + semanticComparison + " while byte order = " + byteComparison); } + int implicitByteComparison = t1.compareTo(t2); + if(Integer.signum(semanticComparison) != Integer.signum(implicitByteComparison)) { + throw new RuntimeException("Tuple t1 and t2 comparison mismatched: semantic = " + semanticComparison + " while implicit byte order = " + implicitByteComparison); + } + } + } + } + + // These should be in ArrayUtilTest, but those can't be run at the moment, so here they go. + private static void replaceTests() { + List arrays = Arrays.asList( + new byte[]{0x01, 0x02, 0x01, 0x02}, new byte[]{0x01, 0x02}, new byte[]{0x03, 0x04}, new byte[]{0x03, 0x04, 0x03, 0x04}, + new byte[]{0x01, 0x02, 0x01, 0x02}, new byte[]{0x01, 0x02}, new byte[]{0x03}, new byte[]{0x03, 0x03}, + new byte[]{0x01, 0x02, 0x01, 0x02}, new byte[]{0x01, 0x02}, new byte[]{0x03, 0x04, 0x05}, new byte[]{0x03, 0x04, 0x05, 0x03, 0x04, 0x05}, + new byte[]{0x00, 0x01, 0x02, 0x00, 0x01, 0x02, 0x00}, new byte[]{0x01, 0x02}, new byte[]{0x03, 0x04, 0x05}, new byte[]{0x00, 0x03, 0x04, 0x05, 0x00, 0x03, 0x04, 0x05, 0x00}, + new byte[]{0x01, 0x01, 0x01, 0x01}, new byte[]{0x01, 0x02}, new byte[]{0x03, 0x04}, new byte[]{0x01, 0x01, 0x01, 0x01}, + new byte[]{0x01, 0x01, 0x01, 0x01}, new byte[]{0x01, 0x02}, new byte[]{0x03}, new byte[]{0x01, 0x01, 0x01, 0x01}, + new byte[]{0x01, 0x01, 0x01, 0x01}, new byte[]{0x01, 0x02}, new byte[]{0x03, 0x04, 0x05}, new byte[]{0x01, 0x01, 0x01, 0x01}, + new byte[]{0x01, 0x01, 0x01, 0x01, 0x01}, new byte[]{0x01, 0x01}, new byte[]{0x03, 0x04, 0x05}, new byte[]{0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x01}, + new byte[]{0x01, 0x01, 0x01, 0x01, 0x01}, new byte[]{0x01, 0x01}, new byte[]{0x03, 0x04}, new byte[]{0x03, 0x04, 0x03, 0x04, 0x01}, + new byte[]{0x01, 0x01, 0x01, 0x01, 0x01}, new byte[]{0x01, 0x01}, new byte[]{0x03}, new byte[]{0x03, 0x03, 0x01}, + new byte[]{0x01, 0x02, 0x01, 0x02}, new byte[]{0x01, 0x02}, null, new byte[0], + new byte[]{0x01, 0x02, 0x01, 0x02}, new byte[]{0x01, 0x02}, new byte[0], new byte[0], + new byte[]{0x01, 0x02, 0x01, 0x02}, null, new byte[]{0x04}, new byte[]{0x01, 0x02, 0x01, 0x02}, + new byte[]{0x01, 0x02, 0x01, 0x02}, new byte[0], new byte[]{0x04}, new byte[]{0x01, 0x02, 0x01, 0x02}, + null, new byte[]{0x01, 0x02}, new byte[]{0x04}, null + ); + for(int i = 0; i < arrays.size(); i += 4) { + byte[] src = arrays.get(i); + byte[] pattern = arrays.get(i + 1); + byte[] replacement = arrays.get(i + 2); + byte[] expectedResults = arrays.get(i + 3); + byte[] results = ByteArrayUtil.replace(src, pattern, replacement); + if(!Arrays.equals(results, expectedResults)) { + throw new RuntimeException("results " + ByteArrayUtil.printable(results) + " did not match expected results " + + ByteArrayUtil.printable(expectedResults) + " when replacing " + ByteArrayUtil.printable(pattern) + + " with " + ByteArrayUtil.printable(replacement) + " in " + ByteArrayUtil.printable(src)); + } + if(src != null && src == results) { + throw new RuntimeException("src and results array are pointer-equal when replacing " + ByteArrayUtil.printable(pattern) + + " with " + ByteArrayUtil.printable(replacement) + " in " + ByteArrayUtil.printable(src)); } } } From 39fd30330f95454ee46486a9fe7dd54d5ade26ac Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Wed, 27 Feb 2019 20:25:30 -0800 Subject: [PATCH 28/46] memoize incomplete versionstamp information in Tuples ; add more tests --- 
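A short illustrative sketch of what the memoized flag buys (editorial example, not part
of the diff below; it assumes the standard Versionstamp.incomplete() factory, and passes
a null prefix where a real subspace prefix would normally go):

    Tuple t = Tuple.from("key").add(Versionstamp.incomplete());
    // hasIncompleteVersionstamp() now reads a flag computed at construction time
    // instead of re-scanning every element (including nested tuples) on each call.
    assert t.hasIncompleteVersionstamp();
    byte[] key = t.packWithVersionstamp(null); // exactly one incomplete versionstamp: OK
    // t.pack() would throw IllegalArgumentException here, since the tuple is incomplete.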
 .../apple/foundationdb/subspace/Subspace.java |   7 +-
 .../tuple/IterableComparator.java             |   2 +-
 .../com/apple/foundationdb/tuple/Tuple.java   | 291 ++++----
 .../apple/foundationdb/tuple/TupleUtil.java   | 117 ++--
 .../foundationdb/tuple/Versionstamp.java      |   4 +-
 .../apple/foundationdb/test/TupleTest.java    | 620 +++++++++++++++++-
 6 files changed, 862 insertions(+), 179 deletions(-)

diff --git a/bindings/java/src/main/com/apple/foundationdb/subspace/Subspace.java b/bindings/java/src/main/com/apple/foundationdb/subspace/Subspace.java
index 59c3f94329..4b811f5149 100644
--- a/bindings/java/src/main/com/apple/foundationdb/subspace/Subspace.java
+++ b/bindings/java/src/main/com/apple/foundationdb/subspace/Subspace.java
@@ -46,8 +46,8 @@ import com.apple.foundationdb.tuple.Versionstamp;
 *

 */
 public class Subspace {
-	static final Tuple EMPTY_TUPLE = Tuple.from();
-	static final byte[] EMPTY_BYTES = new byte[0];
+	private static final Tuple EMPTY_TUPLE = Tuple.from();
+	private static final byte[] EMPTY_BYTES = new byte[0];
 
 	private final byte[] rawPrefix;
@@ -248,8 +248,7 @@ public class Subspace {
	 * @return the {@link Range} of keyspace corresponding to {@code tuple}
	 */
	public Range range(Tuple tuple) {
-		Range p = tuple.range();
-		return new Range(join(rawPrefix, p.begin), join(rawPrefix, p.end));
+		return tuple.range(rawPrefix);
	}
 
	/**
diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/IterableComparator.java b/bindings/java/src/main/com/apple/foundationdb/tuple/IterableComparator.java
index 1587b3fd6e..71aa23e9b1 100644
--- a/bindings/java/src/main/com/apple/foundationdb/tuple/IterableComparator.java
+++ b/bindings/java/src/main/com/apple/foundationdb/tuple/IterableComparator.java
@@ -34,7 +34,7 @@ import java.util.Iterator;
 * tuple1.compareTo(tuple2)
 * == new IterableComparator().compare(tuple1, tuple2)
 * == new IterableComparator().compare(tuple1.getItems(), tuple2.getItems()),
- * == ByteArrayUtil.compareUnsigned(tuple1.packInternal(), tuple2.packInternal())}
+ * == ByteArrayUtil.compareUnsigned(tuple1.pack(), tuple2.pack())}
 * </pre>
 *
 * <p>

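The equivalence documented in the comparator javadoc above can be exercised directly.
A minimal sketch (both tuples must be free of incomplete versionstamps, since pack()
rejects those):

    Tuple a = Tuple.from("alpha", 1L);
    Tuple b = Tuple.from("alpha", 2L);
    IterableComparator cmp = new IterableComparator();
    // Semantic, comparator-based, and byte-wise orderings all agree in sign.
    assert Integer.signum(a.compareTo(b)) == Integer.signum(cmp.compare(a, b));
    assert Integer.signum(cmp.compare(a, b))
            == Integer.signum(ByteArrayUtil.compareUnsigned(a.pack(), b.pack()));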
diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java index 5fa9726c14..ea47870037 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java @@ -21,11 +21,11 @@ package com.apple.foundationdb.tuple; import java.math.BigInteger; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import java.util.UUID; import java.util.stream.Collectors; @@ -69,19 +69,39 @@ import com.apple.foundationdb.Range; */ public class Tuple implements Comparable, Iterable { private static final IterableComparator comparator = new IterableComparator(); + private static final byte[] EMPTY_BYTES = new byte[0]; - private List elements; - private int memoizedHash = 0; + List elements; private byte[] packed = null; + private int memoizedHash = 0; private int memoizedPackedSize = -1; + private final boolean incompleteVersionstamp; - private Tuple(List elements, Object newItem) { - this(elements); + private Tuple(Tuple original, Object newItem, boolean itemHasIncompleteVersionstamp) { + this.elements = new ArrayList<>(original.elements.size() + 1); + this.elements.addAll(original.elements); this.elements.add(newItem); + incompleteVersionstamp = original.incompleteVersionstamp || itemHasIncompleteVersionstamp; } - private Tuple(List elements) { - this.elements = new ArrayList<>(elements); + private Tuple(List elements) { + this.elements = elements; + incompleteVersionstamp = TupleUtil.hasIncompleteVersionstamp(elements.stream()); + } + + /** + * Construct a new empty {@code Tuple}. After creation, items can be added + * with calls to the variations of {@code add()}. + * + * @see #from(Object...) 
+ * @see #fromBytes(byte[]) + * @see #fromItems(Iterable) + */ + public Tuple() { + elements = Collections.emptyList(); + packed = EMPTY_BYTES; + memoizedPackedSize = 0; + incompleteVersionstamp = false; } /** @@ -107,7 +127,10 @@ public class Tuple implements Comparable, Iterable { !(o instanceof Versionstamp)) { throw new IllegalArgumentException("Parameter type (" + o.getClass().getName() + ") not recognized"); } - return new Tuple(this.elements, o); + return new Tuple(this, o, + (o instanceof Versionstamp && !((Versionstamp)o).isComplete()) || + (o instanceof List && TupleUtil.hasIncompleteVersionstamp(((List)o).stream())) || + (o instanceof Tuple && ((Tuple) o).hasIncompleteVersionstamp())); } /** @@ -118,7 +141,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(String s) { - return new Tuple(this.elements, s); + return new Tuple(this, s, false); } /** @@ -129,7 +152,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(long l) { - return new Tuple(this.elements, l); + return new Tuple(this, l, false); } /** @@ -140,7 +163,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(byte[] b) { - return new Tuple(this.elements, b); + return new Tuple(this, b, false); } /** @@ -151,7 +174,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(boolean b) { - return new Tuple(this.elements, b); + return new Tuple(this, b, false); } /** @@ -162,7 +185,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(UUID uuid) { - return new Tuple(this.elements, uuid); + return new Tuple(this, uuid, false); } /** @@ -178,7 +201,7 @@ public class Tuple implements Comparable, Iterable { if(bi == null) { throw new NullPointerException("Number types in Tuple cannot be null"); } - return new Tuple(this.elements, bi); + return new Tuple(this, bi, false); } /** @@ -189,7 +212,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(float f) { - return new Tuple(this.elements, f); + return new Tuple(this, f, false); } /** @@ -200,7 +223,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(double d) { - return new Tuple(this.elements, d); + return new Tuple(this, d, false); } /** @@ -212,11 +235,11 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(Versionstamp v) { - return new Tuple(this.elements, v); + return new Tuple(this, v, !v.isComplete()); } /** - * Creates a copy of this {@code Tuple} with an {@link List} appended as the last element. + * Creates a copy of this {@code Tuple} with a {@link List} appended as the last element. * This does not add the elements individually (for that, use {@link Tuple#addAll(List) Tuple.addAll}). * This adds the list as a single element nested within the outer {@code Tuple}. 
* @@ -224,8 +247,8 @@ public class Tuple implements Comparable, Iterable { * * @return a newly created {@code Tuple} */ - public Tuple add(List l) { - return new Tuple(this.elements, l); + public Tuple add(List l) { + return new Tuple(this, l, TupleUtil.hasIncompleteVersionstamp(l.stream())); } /** @@ -238,7 +261,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(Tuple t) { - return new Tuple(this.elements, t); + return new Tuple(this, t, t.hasIncompleteVersionstamp()); } /** @@ -251,7 +274,7 @@ public class Tuple implements Comparable, Iterable { * @return a newly created {@code Tuple} */ public Tuple add(byte[] b, int offset, int length) { - return new Tuple(this.elements, Arrays.copyOfRange(b, offset, offset + length)); + return new Tuple(this, Arrays.copyOfRange(b, offset, offset + length), false); } /** @@ -262,7 +285,7 @@ public class Tuple implements Comparable, Iterable { * * @return a newly created {@code Tuple} */ - public Tuple addAll(List o) { + public Tuple addAll(List o) { List merged = new ArrayList<>(o.size() + this.elements.size()); merged.addAll(this.elements); merged.addAll(o); @@ -279,8 +302,15 @@ public class Tuple implements Comparable, Iterable { public Tuple addAll(Tuple other) { List merged = new ArrayList<>(this.size() + other.size()); merged.addAll(this.elements); - merged.addAll(other.peekItems()); - return new Tuple(merged); + merged.addAll(other.elements); + Tuple t = new Tuple(merged); + if(!t.hasIncompleteVersionstamp() && packed != null && other.packed != null) { + t.packed = ByteArrayUtil.join(packed, other.packed); + } + if(memoizedPackedSize >= 0 && other.memoizedPackedSize >= 0) { + t.memoizedPackedSize = memoizedPackedSize + other.memoizedPackedSize; + } + return t; } /** @@ -306,29 +336,44 @@ public class Tuple implements Comparable, Iterable { } byte[] packInternal(byte[] prefix, boolean copy) { - boolean hasPrefix = prefix != null && prefix.length > 1; - if(packed == null) { - byte[] result = TupleUtil.pack(elements, prefix, getPackedSize()); - if(hasPrefix) { - packed = Arrays.copyOfRange(result, prefix.length, result.length); - memoizedPackedSize = packed.length; - return result; - } - else { - packed = result; - memoizedPackedSize = packed.length; - } + if(hasIncompleteVersionstamp()) { + throw new IllegalArgumentException("Incomplete Versionstamp included in vanilla tuple pack"); } + if(packed == null) { + packed = TupleUtil.pack(elements, getPackedSize()); + } + boolean hasPrefix = prefix != null && prefix.length > 0; if(hasPrefix) { return ByteArrayUtil.join(prefix, packed); } + else if(copy) { + return Arrays.copyOf(packed, packed.length); + } else { - if(copy) { - return Arrays.copyOf(packed, packed.length); - } - else { - return packed; - } + return packed; + } + } + + /** + * Pack an encoded representation of this {@code Tuple} onto the end of the given {@link ByteBuffer}. + * It is up to the caller to ensure that there is enough space allocated within the buffer + * to avoid {@link java.nio.BufferOverflowException}s. The client may call {@link #getPackedSize()} + * to determine how large this {@code Tuple} will be once packed in order to allocate sufficient memory. + *
+	 * <p>
+ * This method will throw an error if there are any incomplete {@link Versionstamp}s in this {@code Tuple}. + * + * @param dest the destination {@link ByteBuffer} for the encoded {@code Tuple} + */ + public void packInto(ByteBuffer dest) { + if(hasIncompleteVersionstamp()) { + throw new IllegalArgumentException("Incomplete Versionstamp included in vanilla tuple pack"); + } + if(packed == null) { + TupleUtil.pack(dest, elements); + } + else { + dest.put(packed); } } @@ -363,37 +408,27 @@ public class Tuple implements Comparable, Iterable { * @throws IllegalArgumentException if there is not exactly one incomplete {@link Versionstamp} included in this {@code Tuple} */ public byte[] packWithVersionstamp(byte[] prefix) { - return TupleUtil.packWithVersionstamp(elements, prefix, getPackedSize()); + return packWithVersionstampInternal(prefix, true); } byte[] packWithVersionstampInternal(byte[] prefix, boolean copy) { - boolean hasPrefix = prefix != null && prefix.length > 0; - if(packed == null) { - byte[] result = TupleUtil.packWithVersionstamp(elements, prefix, getPackedSize()); - if(hasPrefix) { - byte[] withoutPrefix = Arrays.copyOfRange(result, prefix.length, result.length); - TupleUtil.adjustVersionPosition(packed, -1 * prefix.length); - packed = withoutPrefix; - memoizedPackedSize = packed.length; - return result; - } - else { - packed = result; - memoizedPackedSize = packed.length; - } + if(!hasIncompleteVersionstamp()) { + throw new IllegalArgumentException("No incomplete Versionstamp included in tuple pack with versionstamp"); } + if(packed == null) { + packed = TupleUtil.packWithVersionstamp(elements, getPackedSize()); + } + boolean hasPrefix = prefix != null && prefix.length > 0; if(hasPrefix) { byte[] withPrefix = ByteArrayUtil.join(prefix, packed); TupleUtil.adjustVersionPosition(withPrefix, prefix.length); return withPrefix; } + else if(copy) { + return Arrays.copyOf(packed, packed.length); + } else { - if(copy) { - return Arrays.copyOf(packed, packed.length); - } - else { - return packed; - } + return packed; } } @@ -429,16 +464,6 @@ public class Tuple implements Comparable, Iterable { return elements.stream(); } - /** - * Returns the internal elements that make up this tuple. For internal use only, as - * modifications to the result will mean that this Tuple is modified. - * - * @return the elements of this Tuple, without copying - */ - private List peekItems() { - return this.elements; - } - /** * Gets an {@code Iterator} over the {@code Objects} in this {@code Tuple}. This {@code Iterator} is * unmodifiable and will throw an exception if {@link Iterator#remove() remove()} is called. @@ -450,18 +475,6 @@ public class Tuple implements Comparable, Iterable { return Collections.unmodifiableList(this.elements).iterator(); } - /** - * Construct a new empty {@code Tuple}. After creation, items can be added - * with calls the the variations of {@code add()}. - * - * @see #from(Object...) - * @see #fromBytes(byte[]) - * @see #fromItems(Iterable) - */ - public Tuple() { - this.elements = new LinkedList<>(); - } - /** * Construct a new {@code Tuple} with elements decoded from a supplied {@code byte} array. * The passed byte array must not be {@code null}. 
@@ -485,9 +498,15 @@ public class Tuple implements Comparable, Iterable { * @return a new {@code Tuple} constructed by deserializing the specified slice of the provided {@code byte} array */ public static Tuple fromBytes(byte[] bytes, int offset, int length) { - Tuple t = new Tuple(); - t.elements = TupleUtil.unpack(bytes, offset, length); - t.packed = Arrays.copyOfRange(bytes, offset, offset + length); + if(offset < 0 || offset > bytes.length) { + throw new IllegalArgumentException("Invalid offset for Tuple deserialization"); + } + if(length < 0 || offset + length > bytes.length) { + throw new IllegalArgumentException("Invalid length for Tuple deserialization"); + } + byte[] packed = Arrays.copyOfRange(bytes, offset, offset + length); + Tuple t = new Tuple(TupleUtil.unpack(packed)); + t.packed = packed; t.memoizedPackedSize = length; return t; } @@ -732,7 +751,7 @@ public class Tuple implements Comparable, Iterable { return (Tuple)o; } else if(o instanceof List) { - return Tuple.fromItems((List)o); + return Tuple.fromList((List)o); } else { throw new ClassCastException("Cannot convert item of type " + o.getClass() + " to tuple"); @@ -761,11 +780,7 @@ public class Tuple implements Comparable, Iterable { if(elements.isEmpty()) throw new IllegalStateException("Tuple contains no elements"); - List items = new ArrayList<>(elements.size() - 1); - for(int i = 1; i < this.elements.size(); i++) { - items.add(this.elements.get(i)); - } - return new Tuple(items); + return new Tuple(elements.subList(1, elements.size())); } /** @@ -779,11 +794,7 @@ public class Tuple implements Comparable, Iterable { if(elements.isEmpty()) throw new IllegalStateException("Tuple contains no elements"); - List items = new ArrayList<>(elements.size() - 1); - for(int i = 0; i < this.elements.size() - 1; i++) { - items.add(this.elements.get(i)); - } - return new Tuple(items); + return new Tuple(elements.subList(0, elements.size() - 1)); } /** @@ -800,17 +811,39 @@ public class Tuple implements Comparable, Iterable { * This function will throw an error if this {@code Tuple} contains an incomplete * {@link Versionstamp}. * - * @return the range of keys containing all {@code Tuple}s that have this {@code Tuple} - * as a prefix + * @return the range of keys containing all possible keys that have this {@code Tuple} + * as a strict prefix */ public Range range() { + return range(null); + } + + /** + * Returns a range representing all keys that encode {@code Tuple}s strictly starting + * with the given prefix followed by this {@code Tuple}. + *
+	 * <p>
+	 *   For example:
+	 * <pre>
+	 *   Tuple t = Tuple.from("a", "b");
+	 *   Range r = t.range(Tuple.from("c").pack());</pre>
+	 * {@code r} contains all tuples ("c", "a", "b", ...)
+	 * <p>
+ * This function will throw an error if this {@code Tuple} contains an incomplete + * {@link Versionstamp}. + * + * @param prefix a byte prefix to precede all elements in the range + * + * @return the range of keys containing all possible keys that have {@code prefix} + * followed by this {@code Tuple} as a strict prefix + */ + public Range range(byte[] prefix) { if(hasIncompleteVersionstamp()) { throw new IllegalStateException("Tuple with incomplete versionstamp used for range"); } - byte[] p = packInternal(null, false); - //System.out.println("Packed tuple is: " + ByteArrayUtil.printable(p)); + byte[] p = packInternal(prefix, false); return new Range(ByteArrayUtil.join(p, new byte[] {0x0}), - ByteArrayUtil.join(p, new byte[] {(byte)0xff})); + ByteArrayUtil.join(p, new byte[] {(byte)0xff})); } /** @@ -823,7 +856,7 @@ public class Tuple implements Comparable, Iterable { * {@code Tuple} */ public boolean hasIncompleteVersionstamp() { - return TupleUtil.hasIncompleteVersionstamp(stream()); + return incompleteVersionstamp; } /** @@ -843,7 +876,21 @@ public class Tuple implements Comparable, Iterable { } int getPackedSize(boolean nested) { - return TupleUtil.getPackedSize(elements, nested); + if(memoizedPackedSize >= 0) { + if(!nested) { + return memoizedPackedSize; + } + int nullCount = 0; + for(Object elem : elements) { + if(elem == null) { + nullCount++; + } + } + return memoizedPackedSize + nullCount; + } + else { + return TupleUtil.getPackedSize(elements, nested); + } } /** @@ -860,7 +907,9 @@ public class Tuple implements Comparable, Iterable { */ @Override public int compareTo(Tuple t) { - if(packed != null && t.packed != null) { + // If either tuple has an incomplete versionstamp, then there is a possibility that the byte order + // is not the semantic comparison order. + if(packed != null && t.packed != null && !hasIncompleteVersionstamp() && !t.hasIncompleteVersionstamp()) { return ByteArrayUtil.compareUnsigned(packed, t.packed); } else { @@ -959,12 +1008,15 @@ public class Tuple implements Comparable, Iterable { * * @return a new {@code Tuple} with the given items as its elements */ - public static Tuple fromItems(Iterable items) { - Tuple t = new Tuple(); - for(Object o : items) { - t = t.addObject(o); + public static Tuple fromItems(Iterable items) { + if(items instanceof List) { + return Tuple.fromList((List)items); } - return t; + List elements = new ArrayList<>(); + for(Object o : items) { + elements.add(o); + } + return new Tuple(elements); } /** @@ -977,8 +1029,9 @@ public class Tuple implements Comparable, Iterable { * * @return a new {@code Tuple} with the given items as its elements */ - public static Tuple fromList(List items) { - return new Tuple(items); + public static Tuple fromList(List items) { + List elements = new ArrayList<>(items); + return new Tuple(elements); } /** @@ -992,10 +1045,8 @@ public class Tuple implements Comparable, Iterable { * * @return a new {@code Tuple} with the given items as its elements */ - public static Tuple fromStream(Stream items) { - Tuple t = new Tuple(); - t.elements = items.collect(Collectors.toList()); - return t; + public static Tuple fromStream(Stream items) { + return new Tuple(items.collect(Collectors.toList())); } /** @@ -1009,7 +1060,7 @@ public class Tuple implements Comparable, Iterable { * @return a new {@code Tuple} with the given items as its elements */ public static Tuple from(Object... 
items) { - return fromList(Arrays.asList(items)); + return new Tuple(Arrays.asList(items)); } static void main(String[] args) { diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java index fc1fbc7262..63a1944b5d 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java @@ -21,6 +21,7 @@ package com.apple.foundationdb.tuple; import java.math.BigInteger; +import java.nio.BufferOverflowException; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.charset.Charset; @@ -89,7 +90,7 @@ class TupleUtil { x += 1; } } - throw new IllegalArgumentException("no terminator found for bytes starting at " + from); + throw new IllegalArgumentException("No terminator found for bytes starting at " + from); } } @@ -135,6 +136,7 @@ class TupleUtil { else { ByteArrayUtil.replace(encoded, 0, encoded.length, NULL_ARR, NULL_ESCAPED_ARR, encodedBytes); } + totalLength += encoded.length + nullCount; return this; } @@ -157,6 +159,10 @@ class TupleUtil { } } + private static boolean useOldVersionOffsetFormat() { + return FDB.instance().getAPIVersion() < 520; + } + // These four functions are for adjusting the encoding of floating point numbers so // that when their byte representation is written out in big-endian order, unsigned // lexicographic byte comparison orders the values in the same way as the semantic @@ -165,32 +171,32 @@ class TupleUtil { // in the case that the number is positive. For these purposes, 0.0 is positive and -0.0 // is negative. - static int encodeFloatBits(float f) { + private static int encodeFloatBits(float f) { int intBits = Float.floatToRawIntBits(f); return (intBits < 0) ? (~intBits) : (intBits ^ Integer.MIN_VALUE); } - static long encodeDoubleBits(double d) { + private static long encodeDoubleBits(double d) { long longBits = Double.doubleToRawLongBits(d); return (longBits < 0L) ? (~longBits) : (longBits ^ Long.MIN_VALUE); } - static float decodeFloatBits(int i) { + private static float decodeFloatBits(int i) { int origBits = (i >= 0) ? (~i) : (i ^ Integer.MIN_VALUE); return Float.intBitsToFloat(origBits); } - static double decodeDoubleBits(long l) { + private static double decodeDoubleBits(long l) { long origBits = (l >= 0) ? (~l) : (l ^ Long.MIN_VALUE); return Double.longBitsToDouble(origBits); } // Get the minimal number of bytes in the representation of a long. - static int minimalByteCount(long i) { + private static int minimalByteCount(long i) { return (Long.SIZE + 7 - Long.numberOfLeadingZeros(i >= 0 ? i : -i)) / 8; } - static int minimalByteCount(BigInteger i) { + private static int minimalByteCount(BigInteger i) { int bitLength = (i.compareTo(BigInteger.ZERO) >= 0) ? 
i.bitLength() : i.negate().bitLength(); return (bitLength + 7) / 8; } @@ -221,7 +227,7 @@ class TupleUtil { } static void adjustVersionPosition(byte[] packed, int delta) { - if(FDB.instance().getAPIVersion() < 520) { + if(useOldVersionOffsetFormat()) { adjustVersionPosition300(packed, delta); } else { @@ -285,7 +291,7 @@ class TupleUtil { else if(t instanceof List) encode(state, (List)t); else if(t instanceof Tuple) - encode(state, ((Tuple)t).getItems()); + encode(state, (Tuple)t); else throw new IllegalArgumentException("Unsupported data type: " + t.getClass().getName()); } @@ -409,6 +415,10 @@ class TupleUtil { state.add(nil); } + static void encode(EncodeState state, Tuple value) { + encode(state, value.elements); + } + static void decode(DecodeState state, byte[] rep, int pos, int last) { //System.out.println("Decoding '" + ArrayUtils.printable(rep) + "' at " + pos); @@ -491,8 +501,8 @@ class TupleUtil { int n = positive ? code - INT_ZERO_CODE : INT_ZERO_CODE - code; int end = start + n; - if(rep.length < last) { - throw new RuntimeException("Invalid tuple (possible truncation)"); + if(last < end) { + throw new IllegalArgumentException("Invalid tuple (possible truncation)"); } if(positive && (n < Long.BYTES || rep[start] > 0)) { @@ -530,12 +540,16 @@ class TupleUtil { } } else if(code == VERSIONSTAMP_CODE) { + if(start + Versionstamp.LENGTH > last) { + throw new IllegalArgumentException("Invalid tuple (possible truncation)"); + } Versionstamp val = Versionstamp.fromBytes(Arrays.copyOfRange(rep, start, start + Versionstamp.LENGTH)); state.add(val, start + Versionstamp.LENGTH); } else if(code == NESTED_CODE) { DecodeState subResult = new DecodeState(); int endPos = start; + boolean foundEnd = false; while(endPos < last) { if(rep[endPos] == nil) { if(endPos + 1 < last && rep[endPos+1] == (byte)0xff) { @@ -543,6 +557,7 @@ class TupleUtil { endPos += 2; } else { endPos += 1; + foundEnd = true; break; } } else { @@ -550,6 +565,9 @@ class TupleUtil { endPos = subResult.end; } } + if(!foundEnd) { + throw new IllegalArgumentException("No terminator found for nested tuple starting at " + start); + } state.add(subResult.values, endPos); } else { @@ -558,6 +576,10 @@ class TupleUtil { } static int compareItems(Object item1, Object item2) { + if(item1 == item2) { + // If we have pointer equality, just return 0 immediately. + return 0; + } int code1 = TupleUtil.getCodeFor(item1); int code2 = TupleUtil.getCodeFor(item2); @@ -603,14 +625,14 @@ class TupleUtil { } } if(code1 == FLOAT_CODE) { - // This is done for the same reason that double comparison is done - // that way. + // This is done over vanilla float comparison basically to handle NaNs + // sorting correctly. int fbits1 = encodeFloatBits((Float)item1); int fbits2 = encodeFloatBits((Float)item2); return Integer.compareUnsigned(fbits1, fbits2); } if(code1 == DOUBLE_CODE) { - // This is done over vanilla double comparison basically to handle NaN + // This is done over vanilla double comparison basically to handle NaNs // sorting correctly. 
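			// (encodeDoubleBits flips every bit of a negative double but only the sign bit
			// of a non-negative one, so unsigned comparison of the adjusted bits produces a
			// total order: -NaN sorts first, NaN sorts last, and -0.0 sorts before 0.0.)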
long dbits1 = encodeDoubleBits((Double)item1); long dbits2 = encodeDoubleBits((Double)item2); @@ -637,58 +659,57 @@ class TupleUtil { throw new IllegalArgumentException("Unknown tuple data type: " + item1.getClass()); } - static List unpack(byte[] bytes, int start, int length) { - DecodeState decodeState = new DecodeState(); - int pos = start; - int end = start + length; - while(pos < end) { - decode(decodeState, bytes, pos, end); - pos = decodeState.end; + static List unpack(byte[] bytes) { + try { + DecodeState decodeState = new DecodeState(); + int pos = 0; + int end = bytes.length; + while (pos < end) { + decode(decodeState, bytes, pos, end); + pos = decodeState.end; + } + return decodeState.values; + } + catch(IndexOutOfBoundsException | BufferOverflowException e) { + throw new IllegalArgumentException("Invalid tuple (possible truncation)", e); } - return decodeState.values; } - static void encodeAll(EncodeState state, List items, byte[] prefix) { - if(prefix != null) { - state.add(prefix); - } + static void encodeAll(EncodeState state, List items) { for(Object t : items) { encode(state, t); } - //System.out.println("Joining whole tuple..."); } - static byte[] pack(List items, byte[] prefix, int expectedSize) { - ByteBuffer dest = ByteBuffer.allocate(expectedSize + (prefix != null ? prefix.length : 0)); + static void pack(ByteBuffer dest, List items) { + ByteOrder origOrder = dest.order(); EncodeState state = new EncodeState(dest); - if(prefix != null) { - state.add(prefix); - } - encodeAll(state, items, prefix); + encodeAll(state, items); + dest.order(origOrder); if(state.versionPos >= 0) { - throw new IllegalArgumentException("Incomplete Versionstamp included in vanilla tuple packInternal"); - } - else { - return dest.array(); + throw new IllegalArgumentException("Incomplete Versionstamp included in vanilla tuple pack"); } } - static byte[] packWithVersionstamp(List items, byte[] prefix, int expectedSize) { - ByteBuffer dest = ByteBuffer.allocate(expectedSize + (prefix != null ? prefix.length : 0)); + static byte[] pack(List items, int expectedSize) { + ByteBuffer dest = ByteBuffer.allocate(expectedSize); + pack(dest, items); + return dest.array(); + } + + static byte[] packWithVersionstamp(List items, int expectedSize) { + ByteBuffer dest = ByteBuffer.allocate(expectedSize); EncodeState state = new EncodeState(dest); - if(prefix != null) { - state.add(prefix); - } - encodeAll(state, items, prefix); + encodeAll(state, items); if(state.versionPos < 0) { throw new IllegalArgumentException("No incomplete Versionstamp included in tuple packInternal with versionstamp"); } else { - if(state.versionPos > 0xffff) { + if(useOldVersionOffsetFormat() && state.versionPos > 0xffff) { throw new IllegalArgumentException("Tuple has incomplete version at position " + state.versionPos + " which is greater than the maximum " + 0xffff); } dest.order(ByteOrder.LITTLE_ENDIAN); - if (FDB.instance().getAPIVersion() < 520) { + if (useOldVersionOffsetFormat()) { dest.putShort((short)state.versionPos); } else { dest.putInt(state.versionPos); @@ -740,7 +761,7 @@ class TupleUtil { packedSize += 1 + Versionstamp.LENGTH; Versionstamp versionstamp = (Versionstamp)item; if(!versionstamp.isComplete()) { - int suffixSize = FDB.instance().getAPIVersion() < 520 ? Short.BYTES : Integer.BYTES; + int suffixSize = useOldVersionOffsetFormat() ? 
Short.BYTES : Integer.BYTES; packedSize += suffixSize; } } @@ -776,7 +797,7 @@ class TupleUtil { public static void main(String[] args) { try { - byte[] bytes = pack(Collections.singletonList(4), null, 2); + byte[] bytes = pack(Collections.singletonList(4), 2); DecodeState result = new DecodeState(); decode(result, bytes, 0, bytes.length); int val = ((Number)result.values.get(0)).intValue(); @@ -788,7 +809,7 @@ class TupleUtil { } try { - byte[] bytes = pack(Collections.singletonList("\u021Aest \u0218tring"), null, 15); + byte[] bytes = pack(Collections.singletonList("\u021Aest \u0218tring"), 15); DecodeState result = new DecodeState(); decode(result, bytes, 0, bytes.length); String string = (String)result.values.get(0); diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Versionstamp.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Versionstamp.java index 85c6de37ae..07c3218eac 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Versionstamp.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Versionstamp.java @@ -94,8 +94,8 @@ public class Versionstamp implements Comparable { private static final byte[] UNSET_TRANSACTION_VERSION = {(byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff}; - private boolean complete; - private byte[] versionBytes; + private final boolean complete; + private final byte[] versionBytes; /** * From a byte array, unpack the user version starting at the given position. diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java index 2f0fd1c2c4..ac2b033748 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java @@ -21,13 +21,21 @@ package com.apple.foundationdb.test; import java.math.BigInteger; +import java.nio.BufferOverflowException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.UUID; +import java.util.stream.Stream; +import com.apple.foundationdb.Database; +import com.apple.foundationdb.FDB; import com.apple.foundationdb.TransactionContext; +import com.apple.foundationdb.subspace.Subspace; import com.apple.foundationdb.tuple.ByteArrayUtil; import com.apple.foundationdb.tuple.Tuple; import com.apple.foundationdb.tuple.Versionstamp; @@ -38,15 +46,19 @@ public class TupleTest { public static void main(String[] args) throws InterruptedException { final int reps = 1000; try { - // FDB fdb = FDB.selectAPIVersion(610); - serializedForms(); + FDB fdb = FDB.selectAPIVersion(610); + addMethods(); comparisons(); + emptyTuple(); + incompleteVersionstamps(); + intoBuffer(); + offsetsAndLengths(); + malformedBytes(); replaceTests(); - /* + serializedForms(); try(Database db = fdb.open()) { runTests(reps, db); } - */ } catch(Throwable t) { t.printStackTrace(); } @@ -269,6 +281,606 @@ public class TupleTest { } } + private static void emptyTuple() { + Tuple t = new Tuple(); + if(!t.isEmpty()) { + throw new RuntimeException("empty tuple is not empty"); + } + if(t.getPackedSize() != 0) { + throw new RuntimeException("empty tuple packed size is not 0"); + } + if(t.pack().length != 0) { + throw new RuntimeException("empty tuple is not packed to the empty byte string"); + } + } + + private static void addMethods() { + List baseTuples 
= Arrays.asList( + new Tuple(), + Tuple.from(), + Tuple.from((Object)null), + Tuple.from("prefix"), + Tuple.from("prefix", null), + Tuple.from(new UUID(100, 1000)), + Tuple.from(Versionstamp.incomplete(1)), + Tuple.from(Tuple.from(Versionstamp.incomplete(2))), + Tuple.from(Collections.singletonList(Versionstamp.incomplete(3))) + ); + List toAdd = Arrays.asList( + null, + 1066L, + BigInteger.valueOf(1066), + -3.14f, + 2.71828, + new byte[]{0x01, 0x02, 0x03}, + new byte[]{0x01, 0x00, 0x02, 0x00, 0x03}, + "hello there", + "hell\0 there", + "\ud83d\udd25", + "\ufb14", + false, + true, + Float.NaN, + Float.intBitsToFloat(Integer.MAX_VALUE), + Double.NaN, + Double.longBitsToDouble(Long.MAX_VALUE), + Versionstamp.complete(new byte[]{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09}, 100), + Versionstamp.incomplete(4), + new UUID(-1, 1), + Tuple.from((Object)null), + Tuple.from("suffix", "tuple"), + Tuple.from("s\0ffix", "tuple"), + Arrays.asList("suffix", "tuple"), + Arrays.asList("suffix", null, "tuple"), + Tuple.from("suffix", null, "tuple"), + Tuple.from("suffix", Versionstamp.incomplete(4), "tuple"), + Arrays.asList("suffix", Arrays.asList("inner", Versionstamp.incomplete(5), "tuple"), "tuple") + ); + + for(Tuple baseTuple : baseTuples) { + for(Object newItem : toAdd) { + int baseSize = baseTuple.size(); + Tuple freshTuple = Tuple.fromStream(Stream.concat(baseTuple.stream(), Stream.of(newItem))); + if(freshTuple.size() != baseSize + 1) { + throw new RuntimeException("freshTuple size was not one larger than base size"); + } + Tuple withObjectAdded = baseTuple.addObject(newItem); + if(withObjectAdded.size() != baseSize + 1) { + throw new RuntimeException("withObjectAdded size was not one larger than the base size"); + } + // Use the appropriate "add" overload. + Tuple withValueAdded; + if(newItem == null) { + withValueAdded = baseTuple.addObject(null); + } + else if(newItem instanceof byte[]) { + withValueAdded = baseTuple.add((byte[])newItem); + } + else if(newItem instanceof String) { + withValueAdded = baseTuple.add((String)newItem); + } + else if(newItem instanceof Long) { + withValueAdded = baseTuple.add((Long)newItem); + } + else if(newItem instanceof BigInteger) { + withValueAdded = baseTuple.add((BigInteger)newItem); + } + else if(newItem instanceof Float) { + withValueAdded = baseTuple.add((Float)newItem); + } + else if(newItem instanceof Double) { + withValueAdded = baseTuple.add((Double)newItem); + } + else if(newItem instanceof Boolean) { + withValueAdded = baseTuple.add((Boolean)newItem); + } + else if(newItem instanceof UUID) { + withValueAdded = baseTuple.add((UUID)newItem); + } + else if(newItem instanceof Versionstamp) { + withValueAdded = baseTuple.add((Versionstamp)newItem); + } + else if(newItem instanceof List) { + withValueAdded = baseTuple.add((List)newItem); + } + else if(newItem instanceof Tuple) { + withValueAdded = baseTuple.add((Tuple)newItem); + } + else { + throw new RuntimeException("unknown type for tuple serialization " + newItem.getClass()); + } + // Use Tuple.addAll, which has optimizations if both tuples have been packed already + // Getting their hash codes memoizes the packed representation. 
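A usage sketch of the optimization this comment refers to, assuming the caching behavior added in this patch series (illustrative, not new test code):

    Tuple a = Tuple.from("key", 1066L);
    Tuple b = Tuple.from(3.14);
    a.hashCode();               // hashing packs the tuple and memoizes the bytes
    b.hashCode();
    Tuple joined = a.addAll(b); // can concatenate the two cached encodings
    assert Arrays.equals(joined.pack(), ByteArrayUtil.join(a.pack(), b.pack()));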
+ Tuple newItemTuple = Tuple.from(newItem); + baseTuple.hashCode(); + newItemTuple.hashCode(); + Tuple withTupleAddedAll = baseTuple.addAll(newItemTuple); + Tuple withListAddedAll = baseTuple.addAll(Collections.singletonList(newItem)); + List allTuples = Arrays.asList(freshTuple, withObjectAdded, withValueAdded, withTupleAddedAll, withListAddedAll); + + int basePlusNewSize = baseTuple.getPackedSize() + Tuple.from(newItem).getPackedSize(); + int freshTuplePackedSize = freshTuple.getPackedSize(); + int withObjectAddedPackedSize = withObjectAdded.getPackedSize(); + int withValueAddedPackedSize = withValueAdded.getPackedSize(); + int withTupleAddedAllPackedSize = withTupleAddedAll.getPackedSize(); + int withListAddAllPackedSize = withListAddedAll.getPackedSize(); + if(basePlusNewSize != freshTuplePackedSize || basePlusNewSize != withObjectAddedPackedSize || + basePlusNewSize != withValueAddedPackedSize || basePlusNewSize != withTupleAddedAllPackedSize || + basePlusNewSize != withListAddAllPackedSize) { + throw new RuntimeException("packed sizes not equivalent"); + } + byte[] concatPacked; + byte[] prefixPacked; + byte[] freshPacked; + byte[] objectAddedPacked; + byte[] valueAddedPacked; + byte[] tupleAddedAllPacked; + byte[] listAddedAllPacked; + if(!baseTuple.hasIncompleteVersionstamp() && !Tuple.from(newItem).hasIncompleteVersionstamp()) { + concatPacked = ByteArrayUtil.join(baseTuple.pack(), Tuple.from(newItem).pack()); + prefixPacked = Tuple.from(newItem).pack(baseTuple.pack()); + freshPacked = freshTuple.pack(); + objectAddedPacked = withObjectAdded.pack(); + valueAddedPacked = withValueAdded.pack(); + tupleAddedAllPacked = withTupleAddedAll.pack(); + listAddedAllPacked = withListAddedAll.pack(); + + for(Tuple t : allTuples) { + try { + t.packWithVersionstamp(); + throw new RuntimeException("able to pack tuple without incomplete versionstamp using packWithVersionstamp"); + } + catch(IllegalArgumentException e) { + // eat + } + } + } + else if(!baseTuple.hasIncompleteVersionstamp() && Tuple.from(newItem).hasIncompleteVersionstamp()) { + concatPacked = newItemTuple.packWithVersionstamp(baseTuple.pack()); + try { + prefixPacked = Tuple.from(newItem).packWithVersionstamp(baseTuple.pack()); + } + catch(NullPointerException e) { + prefixPacked = Tuple.from(newItem).packWithVersionstamp(baseTuple.pack()); + } + freshPacked = freshTuple.packWithVersionstamp(); + objectAddedPacked = withObjectAdded.packWithVersionstamp(); + valueAddedPacked = withValueAdded.packWithVersionstamp(); + tupleAddedAllPacked = withTupleAddedAll.packWithVersionstamp(); + listAddedAllPacked = withListAddedAll.packWithVersionstamp(); + + for(Tuple t : allTuples) { + try { + t.pack(); + throw new RuntimeException("able to pack tuple with incomplete versionstamp"); + } + catch(IllegalArgumentException e) { + // eat + } + } + } + else if(baseTuple.hasIncompleteVersionstamp() && !Tuple.from(newItem).hasIncompleteVersionstamp()) { + concatPacked = baseTuple.addAll(Tuple.from(newItem)).packWithVersionstamp(); + prefixPacked = baseTuple.addObject(newItem).packWithVersionstamp(); + freshPacked = freshTuple.packWithVersionstamp(); + objectAddedPacked = withObjectAdded.packWithVersionstamp(); + valueAddedPacked = withValueAdded.packWithVersionstamp(); + tupleAddedAllPacked = withTupleAddedAll.packWithVersionstamp(); + listAddedAllPacked = withListAddedAll.packWithVersionstamp(); + + for(Tuple t : allTuples) { + try { + t.pack(); + throw new RuntimeException("able to pack tuple with incomplete versionstamp"); + } + 
catch(IllegalArgumentException e) { + // eat + } + } + } + else { + for(Tuple t : allTuples) { + try { + t.pack(); + throw new RuntimeException("able to pack tuple with two versionstamps using pack"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + t.packWithVersionstamp(); + throw new RuntimeException("able to pack tuple with two versionstamps using packWithVersionstamp"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + t.hashCode(); + throw new RuntimeException("able to get hash code of tuple with two versionstamps"); + } + catch(IllegalArgumentException e) { + // eat + } + } + concatPacked = null; + prefixPacked = null; + freshPacked = null; + objectAddedPacked = null; + valueAddedPacked = null; + tupleAddedAllPacked = null; + listAddedAllPacked = null; + } + if(!Arrays.equals(concatPacked, freshPacked) || + !Arrays.equals(freshPacked, prefixPacked) || + !Arrays.equals(freshPacked, objectAddedPacked) || + !Arrays.equals(freshPacked, valueAddedPacked) || + !Arrays.equals(freshPacked, tupleAddedAllPacked) || + !Arrays.equals(freshPacked, listAddedAllPacked)) { + throw new RuntimeException("packed values are not concatenation of original packings"); + } + if(freshPacked != null && freshPacked.length != basePlusNewSize) { + throw new RuntimeException("packed length did not match expectation"); + } + if(freshPacked != null) { + if(freshTuple.hashCode() != Arrays.hashCode(freshPacked)) { + throw new IllegalArgumentException("hash code does not match fresh packed"); + } + for(Tuple t : allTuples) { + if(t.hashCode() != freshTuple.hashCode()) { + throw new IllegalArgumentException("hash code mismatch"); + } + if(Tuple.fromItems(t.getItems()).hashCode() != freshTuple.hashCode()) { + throw new IllegalArgumentException("hash code mismatch after re-compute"); + } + } + } + } + } + } + + private static void incompleteVersionstamps() { + if(FDB.instance().getAPIVersion() < 520) { + throw new IllegalStateException("cannot run test with API version " + FDB.instance().getAPIVersion()); + } + // This is a tricky case where there are two tuples with identical representations but different semantics. + byte[] arr = new byte[0x0100fe]; + Arrays.fill(arr, (byte)0x7f); // The actual value doesn't matter, but it can't be zero. + Tuple t1 = Tuple.from(arr, Versionstamp.complete(new byte[]{FF, FF, FF, FF, FF, FF, FF, FF, FF, FF}), new byte[]{0x01, 0x01}); + Tuple t2 = Tuple.from(arr, Versionstamp.incomplete()); + if(t1.equals(t2)) { + throw new RuntimeException("tuples " + t1 + " and " + t2 + " compared equal"); + } + byte[] bytes1 = t1.pack(); + byte[] bytes2 = t2.packWithVersionstamp(); + if(!Arrays.equals(bytes1, bytes2)) { + throw new RuntimeException("tuples " + t1 + " and " + t2 + " did not have matching representations"); + } + if(t1.equals(t2)) { + throw new RuntimeException("tuples " + t1 + " and " + t2 + " compared equal with memoized packed representations"); + } + + // Make sure position information adjustment works. 
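Concretely, "position information adjustment" means the little-endian offset appended to the packed bytes must shift whenever a prefix is prepended. A hedged sketch, assuming API version 520 or newer (where the trailing offset is Integer.BYTES wide):

    Tuple t = Tuple.from(Versionstamp.incomplete(1));
    byte[] plain = t.packWithVersionstamp();
    byte[] prefixed = t.packWithVersionstamp(Tuple.from("prefix").pack());
    // One type-code byte precedes the versionstamp, so the offset is 1...
    int off1 = ByteBuffer.wrap(plain, plain.length - Integer.BYTES, Integer.BYTES)
            .order(ByteOrder.LITTLE_ENDIAN).getInt();    // == 1
    // ...and prepending a packed prefix shifts it by the prefix's length.
    int off2 = ByteBuffer.wrap(prefixed, prefixed.length - Integer.BYTES, Integer.BYTES)
            .order(ByteOrder.LITTLE_ENDIAN).getInt();    // == 1 + prefix length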
+ Tuple t3 = Tuple.from(Versionstamp.incomplete(1)); + if(t3.getPackedSize() != 1 + Versionstamp.LENGTH + Integer.BYTES) { + throw new RuntimeException("incomplete versionstamp has incorrect packed size " + t3.getPackedSize()); + } + byte[] bytes3 = t3.packWithVersionstamp(); + if(ByteBuffer.wrap(bytes3, bytes3.length - Integer.BYTES, Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).getInt() != 1) { + throw new RuntimeException("incomplete versionstamp has incorrect position"); + } + if(!Tuple.fromBytes(bytes3, 0, bytes3.length - Integer.BYTES).equals(Tuple.from(Versionstamp.incomplete(1)))) { + throw new RuntimeException("unpacked bytes did not match"); + } + Subspace subspace = new Subspace(Tuple.from("prefix")); + byte[] bytes4 = subspace.packWithVersionstamp(t3); + if(ByteBuffer.wrap(bytes4, bytes4.length - Integer.BYTES, Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).getInt() != 1 + subspace.getKey().length) { + throw new RuntimeException("incomplete versionstamp has incorrect position with prefix"); + } + if(!Tuple.fromBytes(bytes4, 0, bytes4.length - Integer.BYTES).equals(Tuple.from("prefix", Versionstamp.incomplete(1)))) { + throw new RuntimeException("unpacked bytes with subspace did not match"); + } + try { + // At this point, the representation is cached, so an easy bug would be to have it return the already serialized value + t3.pack(); + throw new RuntimeException("was able to pack versionstamp with incomplete versionstamp"); + } catch(IllegalArgumentException e) { + // eat + } + + // Tuples with two incomplete versionstamps somewhere. + List twoIncompleteList = Arrays.asList( + Tuple.from(Versionstamp.incomplete(1), Versionstamp.incomplete(2)), + Tuple.from(Tuple.from(Versionstamp.incomplete(3)), Tuple.from(Versionstamp.incomplete(4))), + new Tuple().add(Versionstamp.incomplete()).add(Versionstamp.incomplete()), + new Tuple().add(Versionstamp.incomplete()).add(3L).add(Versionstamp.incomplete()), + Tuple.from(Tuple.from(Versionstamp.incomplete()), "dummy_string").add(Tuple.from(Versionstamp.incomplete())), + Tuple.from(Arrays.asList(Versionstamp.incomplete(), "dummy_string")).add(Tuple.from(Versionstamp.incomplete())), + Tuple.from(Tuple.from(Versionstamp.incomplete()), "dummy_string").add(Collections.singletonList(Versionstamp.incomplete())) + ); + for(Tuple t : twoIncompleteList) { + if(!t.hasIncompleteVersionstamp()) { + throw new RuntimeException("tuple doesn't think it has incomplete versionstamp"); + } + if(t.getPackedSize() < 2 * (1 + Versionstamp.LENGTH + Integer.BYTES)) { + throw new RuntimeException("tuple packed size " + t.getPackedSize() + " is smaller than expected"); + } + try { + t.pack(); + throw new RuntimeException("no error thrown when packing any incomplete versionstamps"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + t.packWithVersionstamp(); + throw new RuntimeException("no error thrown when packing with versionstamp with two incompletes"); + } + catch(IllegalArgumentException e) { + // eat + } + } + } + + // Assumes API version < 520 + private static void incompleteVersionstamps300() { + if(FDB.instance().getAPIVersion() >= 520) { + throw new IllegalStateException("cannot run test with API version " + FDB.instance().getAPIVersion()); + } + Tuple t1 = Tuple.from(Versionstamp.complete(new byte[]{FF, FF, FF, FF, FF, FF, FF, FF, FF, FF}), new byte[]{}); + Tuple t2 = Tuple.from(Versionstamp.incomplete()); + if(t1.equals(t2)) { + throw new RuntimeException("tuples " + t1 + " and " + t2 + " compared equal"); + } + byte[] bytes1 = t1.pack(); + 
byte[] bytes2 = t2.packWithVersionstamp(); + if(!Arrays.equals(bytes1, bytes2)) { + throw new RuntimeException("tuples " + t1 + " and " + t2 + " did not have matching representations"); + } + if(t1.equals(t2)) { + throw new RuntimeException("tuples " + t1 + " and " + t2 + " compared equal with memoized packed representations"); + } + + // Make sure position information adjustment works. + Tuple t3 = Tuple.from(Versionstamp.incomplete(1)); + if(t3.getPackedSize() != 1 + Versionstamp.LENGTH + Short.BYTES) { + throw new RuntimeException("incomplete versionstamp has incorrect packed size " + t3.getPackedSize()); + } + byte[] bytes3 = t3.packWithVersionstamp(); + if(ByteBuffer.wrap(bytes3, bytes3.length - Short.BYTES, Short.BYTES).order(ByteOrder.LITTLE_ENDIAN).getShort() != 1) { + throw new RuntimeException("incomplete versionstamp has incorrect position"); + } + if(!Tuple.fromBytes(bytes3, 0, bytes3.length - Short.BYTES).equals(Tuple.from(Versionstamp.incomplete(1)))) { + throw new RuntimeException("unpacked bytes did not match"); + } + Subspace subspace = new Subspace(Tuple.from("prefix")); + byte[] bytes4 = subspace.packWithVersionstamp(t3); + if(ByteBuffer.wrap(bytes4, bytes4.length - Short.BYTES, Short.BYTES).order(ByteOrder.LITTLE_ENDIAN).getShort() != 1 + subspace.getKey().length) { + throw new RuntimeException("incomplete versionstamp has incorrect position with prefix"); + } + if(!Tuple.fromBytes(bytes4, 0, bytes4.length - Short.BYTES).equals(Tuple.from("prefix", Versionstamp.incomplete(1)))) { + throw new RuntimeException("unpacked bytes with subspace did not match"); + } + + // Make sure an offset > 0xFFFF throws an error. + Tuple t4 = Tuple.from(Versionstamp.incomplete(2)); + byte[] bytes5 = t4.packWithVersionstamp(); // Get bytes memoized. + if(ByteBuffer.wrap(bytes5, bytes5.length - Short.BYTES, Short.BYTES).order(ByteOrder.LITTLE_ENDIAN).getShort() != 1) { + throw new RuntimeException("incomplete versionstamp has incorrect position with prefix"); + } + byte[] bytes6 = t4.packWithVersionstamp(new byte[0xfffe]); // Offset is 0xffff + if(!Arrays.equals(Arrays.copyOfRange(bytes5, 0, 1 + Versionstamp.LENGTH), Arrays.copyOfRange(bytes6, 0xfffe, 0xffff + Versionstamp.LENGTH))) { + throw new RuntimeException("area before versionstamp offset did not match"); + } + if((ByteBuffer.wrap(bytes6, bytes6.length - Short.BYTES, Short.BYTES).order(ByteOrder.LITTLE_ENDIAN).getShort() & 0xffff) != 0xffff) { + throw new RuntimeException("incomplete versionstamp has incorrect position with prefix"); + } + try { + t4.packWithVersionstamp(new byte[0xffff]); // Offset is 0x10000 + throw new RuntimeException("able to pack versionstamp with offset that is too large"); + } + catch(IllegalArgumentException e) { + // eat + } + // Same as before, but packed representation is not memoized. 
+ try { + Tuple.from(Versionstamp.incomplete(3)).packWithVersionstamp(new byte[0xffff]); // Offset is 0x10000 + throw new RuntimeException("able to pack versionstamp with offset that is too large"); + } + catch(IllegalArgumentException e) { + // eat + } + } + + private static void malformedBytes() { + List malformedSequences = Arrays.asList( + new byte[]{0x01, (byte)0xde, (byte)0xad, (byte)0xc0, (byte)0xde}, // no termination character for byte array + new byte[]{0x01, (byte)0xde, (byte)0xad, 0x00, FF, (byte)0xc0, (byte)0xde}, // no termination character but null in middle + new byte[]{0x02, 'h', 'e', 'l', 'l', 'o'}, // no termination character for string + new byte[]{0x02, 'h', 'e', 'l', 0x00, FF, 'l', 'o'}, // no termination character but null in the middle + // Invalid UTF-8 decodes malformed as U+FFFD rather than throwing an error + // new byte[]{0x02, 'u', 't', 'f', 0x08, (byte)0x80, 0x00}, // invalid utf-8 code point start character + // new byte[]{0x02, 'u', 't', 'f', 0x08, (byte)0xc0, 0x01, 0x00}, // invalid utf-8 code point second character + new byte[]{0x05, 0x02, 'h', 'e', 'l', 'l', 'o', 0x00}, // no termination character for nested tuple + new byte[]{0x05, 0x02, 'h', 'e', 'l', 'l', 'o', 0x00, 0x00, FF, 0x02, 't', 'h', 'e', 'r', 'e', 0x00}, // no termination character for nested tuple but null in the middle + new byte[]{0x16, 0x01}, // integer truncation + new byte[]{0x12, 0x01}, // integer truncation + new byte[]{0x1d, 0x09, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}, // integer truncation + new byte[]{0x0b, 0x09 ^ FF, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}, // integer truncation + new byte[]{0x20, 0x01, 0x02, 0x03}, // float truncation + new byte[]{0x21, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}, // double truncation + new byte[]{0x30, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e}, // UUID truncation + new byte[]{0x33, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b}, // versionstamp truncation + new byte[]{FF} // unknown start code + ); + for(byte[] sequence : malformedSequences) { + try { + Tuple t = Tuple.fromBytes(sequence); + throw new RuntimeException("Able to unpack " + ByteArrayUtil.printable(sequence) + " into " + t); + } + catch(IllegalArgumentException e) { + System.out.println("Error for " + ByteArrayUtil.printable(sequence) + ": " + e.getMessage()); + } + } + + // Perfectly good byte sequences, but using the offset and length to remove terminal bytes + List wellFormedSequences = Arrays.asList( + Tuple.from((Object)new byte[]{0x01, 0x02}).pack(), + Tuple.from("hello").pack(), + Tuple.from("hell\0").pack(), + Tuple.from(1066L).pack(), + Tuple.from(-1066L).pack(), + Tuple.from(BigInteger.ONE.shiftLeft(Long.SIZE + 1)).pack(), + Tuple.from(BigInteger.ONE.shiftLeft(Long.SIZE + 1).negate()).pack(), + Tuple.from(-3.14f).pack(), + Tuple.from(2.71828).pack(), + Tuple.from(new UUID(1066L, 1415L)).pack(), + Tuple.from(Versionstamp.fromBytes(new byte[]{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c})).pack() + ); + for(byte[] sequence : wellFormedSequences) { + try { + Tuple t = Tuple.fromBytes(sequence, 0, sequence.length - 1); + throw new RuntimeException("Able to unpack " + ByteArrayUtil.printable(sequence) + " into " + t + " without last character"); + } + catch(IllegalArgumentException e) { + System.out.println("Error for " + ByteArrayUtil.printable(sequence) + ": " + e.getMessage()); + } + } + } + + private static void offsetsAndLengths() { + List tuples = Arrays.asList( + new 
Tuple(), + Tuple.from((Object)null), + Tuple.from(null, new byte[]{0x10, 0x66}), + Tuple.from("dummy_string"), + Tuple.from(1066L) + ); + Tuple allTuples = tuples.stream().reduce(new Tuple(), Tuple::addAll); + byte[] allTupleBytes = allTuples.pack(); + + // Unpack each tuple individually using their lengths + int offset = 0; + for(Tuple t : tuples) { + int length = t.getPackedSize(); + Tuple unpacked = Tuple.fromBytes(allTupleBytes, offset, length); + if(!unpacked.equals(t)) { + throw new RuntimeException("unpacked tuple " + unpacked + " does not match serialized tuple " + t); + } + offset += length; + } + + // Unpack successive pairs of tuples. + offset = 0; + for(int i = 0; i < tuples.size() - 1; i++) { + Tuple combinedTuple = tuples.get(i).addAll(tuples.get(i + 1)); + Tuple unpacked = Tuple.fromBytes(allTupleBytes, offset, combinedTuple.getPackedSize()); + if(!unpacked.equals(combinedTuple)) { + throw new RuntimeException("unpacked tuple " + unpacked + " does not match combined tuple " + combinedTuple); + } + offset += tuples.get(i).getPackedSize(); + } + + // Allow an offset to equal the length of the array, but essentially only a zero-length is allowed there. + Tuple emptyAtEndTuple = Tuple.fromBytes(allTupleBytes, allTupleBytes.length, 0); + if(!emptyAtEndTuple.isEmpty()) { + throw new RuntimeException("tuple with no bytes is not empty"); + } + + try { + Tuple.fromBytes(allTupleBytes, -1, 4); + throw new RuntimeException("able to give negative offset to fromBytes"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + Tuple.fromBytes(allTupleBytes, allTupleBytes.length + 1, 4); + throw new RuntimeException("able to give offset larger than array to fromBytes"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + Tuple.fromBytes(allTupleBytes, 0, -1); + throw new RuntimeException("able to give negative length to fromBytes"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + Tuple.fromBytes(allTupleBytes, 0, allTupleBytes.length + 1); + throw new RuntimeException("able to give length larger than array to fromBytes"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + Tuple.fromBytes(allTupleBytes, allTupleBytes.length / 2, allTupleBytes.length / 2 + 2); + throw new RuntimeException("able to exceed array length in fromBytes"); + } + catch(IllegalArgumentException e) { + // eat + } + } + + private static void intoBuffer() { + Tuple t = Tuple.from("hello", 3.14f, "world"); + ByteBuffer buffer = ByteBuffer.allocate("hello".length() + 2 + Float.BYTES + 1 + "world".length() + 2); + t.packInto(buffer); + if(!Arrays.equals(t.pack(), buffer.array())) { + throw new RuntimeException("buffer and tuple do not match"); + } + + buffer = ByteBuffer.allocate(t.getPackedSize() + 2); + buffer.order(ByteOrder.LITTLE_ENDIAN); + t.packInto(buffer); + if(!Arrays.equals(ByteArrayUtil.join(t.pack(), new byte[]{0x00, 0x00}), buffer.array())) { + throw new RuntimeException("buffer and tuple do not match"); + } + if(!buffer.order().equals(ByteOrder.LITTLE_ENDIAN)) { + throw new RuntimeException("byte order changed"); + } + + buffer = ByteBuffer.allocate(t.getPackedSize() + 2); + buffer.put((byte)0x01).put((byte)0x02); + t.packInto(buffer); + if(!Arrays.equals(t.pack(new byte[]{0x01, 0x02}), buffer.array())) { + throw new RuntimeException("buffer and tuple do not match"); + } + + buffer = ByteBuffer.allocate(t.getPackedSize() - 1); + try { + t.packInto(buffer); + throw new RuntimeException("able to pack into buffer that was too small"); + } + 
catch(BufferOverflowException e) { + // eat + } + + Tuple tCopy = Tuple.fromItems(t.getItems()); // remove memoized stuff + buffer = ByteBuffer.allocate(t.getPackedSize() - 1); + try { + tCopy.packInto(buffer); + throw new RuntimeException("able to pack into buffer that was too small"); + } + catch(BufferOverflowException e) { + // eat + } + + Tuple tWithIncomplete = Tuple.from(Versionstamp.incomplete(3)); + buffer = ByteBuffer.allocate(tWithIncomplete.getPackedSize()); + try { + tWithIncomplete.packInto(buffer); + throw new RuntimeException("able to pack incomplete versionstamp into buffer"); + } + catch(IllegalArgumentException e) { + // eat + } + if(buffer.arrayOffset() != 0) { + throw new RuntimeException("offset changed after unsuccessful pack with incomplete versionstamp"); + } + } + // These should be in ArrayUtilTest, but those can't be run at the moment, so here they go. private static void replaceTests() { List arrays = Arrays.asList( From a1c32ce057f714761e3a3614db2b07497acb8fb9 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Thu, 28 Feb 2019 09:35:04 -0800 Subject: [PATCH 29/46] update release notes with Tuple improvements --- documentation/sphinx/source/release-notes.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 606e63d229..a6e03e7ee2 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -40,10 +40,15 @@ Bindings * Java: Deprecated ``FDB.createCluster`` and ``Cluster``. The preferred way to get a ``Database`` is by using ``FDB.open``, which should work in both new and old API versions. `(PR #942) `_ * Java: Removed ``Cluster(long cPtr, Executor executor)`` constructor. This is API breaking for any code that has subclassed the ``Cluster`` class and is not protected by API versioning. `(PR #942) `_ * Java: Several methods relevant to read-only transactions have been moved into the ``ReadTransaction`` interface. +* Java: Tuples now cache previous hash codes and equality checking no longer requires packing the underlying Tuples. `(PR #1166) `_ +* Java: Tuple performance has been improved to use fewer allocations when packing and unpacking. `(Issue #1206) `_ +* Java: Unpacking a Tuple with a byte array or string that is missing the end-of-string character now throws an error. `(Issue #671) `_ +* Java: Unpacking a Tuple constrained to a subset of the underlying array now throws an error when it encounters a truncated integer. `(Issue #672) `_ * Ruby: Removed ``FDB.init``, ``FDB.create_cluster``, and ``FDB.Cluster``. ``FDB.open`` no longer accepts a ``database_name`` parameter. `(PR #942) `_ * Golang: Deprecated ``fdb.StartNetwork``, ``fdb.Open``, ``fdb.MustOpen``, and ``fdb.CreateCluster`` and added ``fdb.OpenDatabase`` and ``fdb.MustOpenDatabase``. The preferred way to start the network and get a ``Database`` is by using ``FDB.OpenDatabase`` or ``FDB.OpenDefault``. `(PR #942) `_ * Flow: Deprecated ``API::createCluster`` and ``Cluster`` and added ``API::createDatabase``. The preferred way to get a ``Database`` is by using ``API::createDatabase``. `(PR #942) `_ * Golang: Added ``fdb.Printable`` to print a human-readable string for a given byte array. Add ``Key.String()``, which converts the ``Key`` to a ``string`` using the ``Printable`` function. `(PR #1010) `_ +* Golang: Tuples now support ``Versionstamp`` operations. `(PR #1187) `_ * Python: Python signal handling didn't work when waiting on a future. 
In particular, pressing Ctrl-C would not successfully interrupt the program. `(PR #1138) `_ Other Changes From 40aa2ba6f0cddec4a5be3d8a545e3d8651405008 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Thu, 28 Feb 2019 16:30:09 -0800 Subject: [PATCH 30/46] CMakeLists alphabetization and Javadoc improvements --- bindings/java/CMakeLists.txt | 4 +-- .../com/apple/foundationdb/tuple/Tuple.java | 34 ++++++++++++++----- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index f8c1c25a65..77a0d5aea0 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -54,9 +54,9 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java src/main/com/apple/foundationdb/tuple/IterableComparator.java src/main/com/apple/foundationdb/tuple/package-info.java + src/main/com/apple/foundationdb/tuple/StringUtil.java src/main/com/apple/foundationdb/tuple/Tuple.java src/main/com/apple/foundationdb/tuple/TupleUtil.java - src/main/com/apple/foundationdb/tuple/StringUtil.java src/main/com/apple/foundationdb/tuple/Versionstamp.java) set(JAVA_TESTS_SRCS @@ -89,8 +89,8 @@ set(JAVA_TESTS_SRCS src/test/com/apple/foundationdb/test/StackUtils.java src/test/com/apple/foundationdb/test/TesterArgs.java src/test/com/apple/foundationdb/test/TestResult.java - src/test/com/apple/foundationdb/test/TupleTest.java src/test/com/apple/foundationdb/test/TuplePerformanceTest.java + src/test/com/apple/foundationdb/test/TupleTest.java src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java src/test/com/apple/foundationdb/test/WatchTest.java src/test/com/apple/foundationdb/test/WhileTrueTest.java) diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java index ea47870037..e5556faaa6 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java @@ -315,9 +315,11 @@ public class Tuple implements Comparable, Iterable { /** * Get an encoded representation of this {@code Tuple}. Each element is encoded to - * {@code byte}s and concatenated. + * {@code byte}s and concatenated. Note that once a {@code Tuple} has been packed, its + * serialized representation is stored internally so that future calls to this function + * are faster than the initial call. * - * @return a packed representation of this {@code Tuple}. + * @return a packed representation of this {@code Tuple} */ public byte[] pack() { return packInternal(null, true); @@ -326,10 +328,12 @@ public class Tuple implements Comparable, Iterable { /** * Get an encoded representation of this {@code Tuple}. Each element is encoded to * {@code byte}s and concatenated, and then the prefix supplied is prepended to - * the array. + * the array. Note that once a {@code Tuple} has been packed, its serialized representation + * is stored internally so that future calls to this function are faster than the + * initial call. * - * @param prefix additional byte-array prefix to prepend to packed bytes. - * @return a packed representation of this {@code Tuple} prepended by the {@code prefix}. 
+ * @param prefix additional byte-array prefix to prepend to the packed bytes + * @return a packed representation of this {@code Tuple} prepended by the {@code prefix} */ public byte[] pack(byte[] prefix) { return packInternal(prefix, true); @@ -359,6 +363,9 @@ public class Tuple implements Comparable, Iterable { * It is up to the caller to ensure that there is enough space allocated within the buffer * to avoid {@link java.nio.BufferOverflowException}s. The client may call {@link #getPackedSize()} * to determine how large this {@code Tuple} will be once packed in order to allocate sufficient memory. + * Note that unlike {@link #pack()}, the serialized representation of this {@code Tuple} is not stored, so + * calling this function multiple times with the same {@code Tuple} requires serializing the {@code Tuple} + * multiple times. *
* <p>
* This method will throw an error if there are any incomplete {@link Versionstamp}s in this {@code Tuple}. @@ -402,6 +409,10 @@ public class Tuple implements Comparable, Iterable { * {@link com.apple.foundationdb.Transaction#mutate(com.apple.foundationdb.MutationType, byte[], byte[]) Transaction.mutate()} * with the {@code SET_VERSIONSTAMPED_KEY} {@link com.apple.foundationdb.MutationType}, and the transaction's * version will then be filled in at commit time. + *
+ * <p>
+ * Note that once a {@code Tuple} has been packed, its serialized representation is stored internally so that + * future calls to this function are faster than the initial call. * * @param prefix additional byte-array prefix to prepend to packed bytes. * @return a packed representation of this {@code Tuple} for use with versionstamp ops. @@ -477,11 +488,14 @@ public class Tuple implements Comparable, Iterable { /** * Construct a new {@code Tuple} with elements decoded from a supplied {@code byte} array. - * The passed byte array must not be {@code null}. + * The passed byte array must not be {@code null}. This will throw an exception if the passed byte + * array does not represent a valid {@code Tuple}. For example, this will throw an error if it + * encounters an unknown type code or if there is a packed element that appears to be truncated. * * @param bytes encoded {@code Tuple} source * * @return a new {@code Tuple} constructed by deserializing the provided {@code byte} array + * @throws IllegalArgumentException if {@code bytes} does not represent a valid {@code Tuple} */ public static Tuple fromBytes(byte[] bytes) { return fromBytes(bytes, 0, bytes.length); @@ -489,13 +503,17 @@ public class Tuple implements Comparable, Iterable { /** * Construct a new {@code Tuple} with elements decoded from a supplied {@code byte} array. - * The passed byte array must not be {@code null}. + * The passed byte array must not be {@code null}. This will throw an exception if the specified slice of + * the passed byte array does not represent a valid {@code Tuple}. For example, this will throw an error + * if it encounters an unknown type code or if there is a packed element that appears to be truncated. * * @param bytes encoded {@code Tuple} source * @param offset starting offset of byte array of encoded data * @param length length of encoded data within the source * * @return a new {@code Tuple} constructed by deserializing the specified slice of the provided {@code byte} array + * @throws IllegalArgumentException if {@code offset} or {@code length} are negative or would exceed the size of + * the array or if {@code bytes} does not represent a valid {@code Tuple} */ public static Tuple fromBytes(byte[] bytes, int offset, int length) { if(offset < 0 || offset > bytes.length) { @@ -864,7 +882,7 @@ public class Tuple implements Comparable, Iterable { * the serialized sizes of all of the elements of this {@code Tuple} and does not pack everything * into a single {@code Tuple}. The return value of this function is stored within this {@code Tuple} * after this function has been called so that subsequent calls on the same object are fast. This method - * does not validate that there is no more than one incomplete {@link Versionstamp} in this {@code Tuple}. + * does not validate that there is not more than one incomplete {@link Versionstamp} in this {@code Tuple}. 
* * @return the number of bytes in the packed representation of this {@code Tuple} */ From 75e475563a65815758f4c81ce8cc593b661bc2da Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Fri, 1 Mar 2019 16:31:51 -0800 Subject: [PATCH 31/46] clarify comments and be more strict about using UUID_BYTES constant --- .../main/com/apple/foundationdb/tuple/TupleUtil.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java index 63a1944b5d..e0e43e48df 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java @@ -40,6 +40,7 @@ class TupleUtil { private static final Charset UTF8 = Charset.forName("UTF-8"); private static final BigInteger LONG_MIN_VALUE = BigInteger.valueOf(Long.MIN_VALUE); private static final BigInteger LONG_MAX_VALUE = BigInteger.valueOf(Long.MAX_VALUE); + private static final int UUID_BYTES = 2 * Long.BYTES; private static final IterableComparator iterableComparator = new IterableComparator(); private static final byte BYTES_CODE = 0x01; @@ -475,10 +476,10 @@ class TupleUtil { state.add(true, start); } else if(code == UUID_CODE) { - ByteBuffer bb = ByteBuffer.wrap(rep, start, 2 * Long.BYTES).order(ByteOrder.BIG_ENDIAN); + ByteBuffer bb = ByteBuffer.wrap(rep, start, UUID_BYTES).order(ByteOrder.BIG_ENDIAN); long msb = bb.getLong(); long lsb = bb.getLong(); - state.add(new UUID(msb, lsb), start + 16); + state.add(new UUID(msb, lsb), start + UUID_BYTES); } else if(code == POS_INT_END) { int n = rep[start] & 0xff; @@ -533,8 +534,8 @@ class TupleUtil { if (val.compareTo(LONG_MIN_VALUE) >= 0 && val.compareTo(LONG_MAX_VALUE) <= 0) { state.add(val.longValue(), end); } else { - // This can occur if the thing can be represented with 8 bytes but not - // the right sign information. + // This can occur if the thing can be represented with 8 bytes but requires using + // the most-significant bit as a normal bit instead of the sign bit. 
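To make the boundary concrete, a small sketch of the two cases, assuming the decode behavior described in the comment above:

    // Long.MAX_VALUE needs 8 bytes but leaves the sign bit clear: decodes as a Long.
    Object asLong = Tuple.fromBytes(Tuple.from(Long.MAX_VALUE).pack()).get(0);
    assert asLong instanceof Long;
    // 2^63 also fits in 8 bytes, but only by using the most-significant bit,
    // so it cannot be a signed long: it decodes as a BigInteger instead.
    Object asBig = Tuple.fromBytes(Tuple.from(BigInteger.ONE.shiftLeft(63)).pack()).get(0);
    assert asBig instanceof BigInteger;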
state.add(val, end); } } @@ -745,7 +746,7 @@ class TupleUtil { else if(item instanceof Boolean) packedSize += 1; else if(item instanceof UUID) - packedSize += 1 + 2 * Long.BYTES; + packedSize += 1 + UUID_BYTES; else if(item instanceof BigInteger) { BigInteger bigInt = (BigInteger)item; int byteCount = minimalByteCount(bigInt); From f66ddb13c2f748e07d3136a06cdda0f471b6da05 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Fri, 1 Mar 2019 16:54:15 -0800 Subject: [PATCH 32/46] rewrite replace without a buffer to use replace with a buffer to first get length --- .../foundationdb/tuple/ByteArrayUtil.java | 64 +++++++++++-------- .../apple/foundationdb/test/TupleTest.java | 36 +++++++++++ 2 files changed, 73 insertions(+), 27 deletions(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java index d848c296ff..83a49051e1 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java @@ -173,35 +173,31 @@ public class ByteArrayUtil { */ public static byte[] replace(byte[] src, int offset, int length, byte[] pattern, byte[] replacement) { + if(offset < 0 || offset > src.length) { + throw new IllegalArgumentException("Invalid offset for array pattern replacement"); + } + if(length < 0 || offset + length > src.length) { + throw new IllegalArgumentException("Invalid length for array pattern replacement"); + } if(pattern == null || pattern.length == 0) { return Arrays.copyOfRange(src, offset, offset + length); } ByteBuffer dest; if(replacement == null || replacement.length != pattern.length) { // Array might change size. This is the "tricky" case. - byte patternFirst = pattern[0]; - int patternOccurrences = 0; - int currentPosition = offset; - while(currentPosition < offset + length) { - if(src[currentPosition] == patternFirst && regionEquals(src, currentPosition, pattern)) { - patternOccurrences++; - currentPosition += pattern.length; + int newLength = replace(src, offset, length, pattern, replacement, null); + if(newLength != length) { + if(newLength < 0) { + System.out.println("oops"); + newLength = replace(src, offset, length, pattern, replacement, null); } - else { - currentPosition++; - } - } - if(patternOccurrences == 0) { - // Pattern doesn't occur. Just return a copy of the needed region. - return Arrays.copyOfRange(src, offset, offset + length); - } - int replacementLength = (replacement == null) ? 0 : replacement.length; - int newLength = length + patternOccurrences * (replacementLength - pattern.length); - if(newLength == 0) { - return new byte[0]; + dest = ByteBuffer.allocate(newLength); } else { - dest = ByteBuffer.allocate(newLength); + // If the array size didn't change, as the pattern and replacement lengths + // differ, it must be the case that there weren't any occurrences of pattern in src + // between offset and offset + length, so we can just return a copy. + return Arrays.copyOfRange(src, offset, offset + length); } } else { @@ -212,21 +208,30 @@ public class ByteArrayUtil { return dest.array(); } - static void replace(byte[] src, int offset, int length, byte[] pattern, byte[] replacement, ByteBuffer dest) { + // Replace any occurrences of pattern in src between offset and offset + length with replacement. + // The new array is serialized into dest and the new length is returned. 
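The rewritten public overload above uses this helper in a measure-then-fill pattern; a minimal sketch of the idea (variable names illustrative):

    // Pass 1: with dest == null nothing is written; the helper only returns
    // the length the output will have after all replacements are applied.
    int newLength = replace(src, 0, src.length, pattern, replacement, null);
    // Pass 2: allocate exactly that many bytes and scan again, writing output.
    ByteBuffer dest = ByteBuffer.allocate(newLength);
    replace(src, 0, src.length, pattern, replacement, dest);
    byte[] result = dest.array();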
+ static int replace(byte[] src, int offset, int length, byte[] pattern, byte[] replacement, ByteBuffer dest) { if(pattern == null || pattern.length == 0) { - dest.put(src, offset, length); - return; + if(dest != null) { + dest.put(src, offset, length); + } + return length; } byte patternFirst = pattern[0]; int lastPosition = offset; int currentPosition = offset; + int newLength = 0; + int replacementLength = replacement == null ? 0 : replacement.length; while(currentPosition < offset + length) { if(src[currentPosition] == patternFirst && regionEquals(src, currentPosition, pattern)) { - dest.put(src, lastPosition, currentPosition - lastPosition); - if(replacement != null) { - dest.put(replacement); + if(dest != null) { + dest.put(src, lastPosition, currentPosition - lastPosition); + if(replacement != null) { + dest.put(replacement); + } } + newLength += currentPosition - lastPosition + replacementLength; currentPosition += pattern.length; lastPosition = currentPosition; } @@ -235,7 +240,12 @@ public class ByteArrayUtil { } } - dest.put(src, lastPosition, currentPosition - lastPosition); + newLength += currentPosition - lastPosition; + if(dest != null) { + dest.put(src, lastPosition, currentPosition - lastPosition); + } + + return newLength; } /** diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java index ac2b033748..f6152664ec 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java @@ -916,6 +916,42 @@ public class TupleTest { " with " + ByteArrayUtil.printable(replacement) + " in " + ByteArrayUtil.printable(src)); } } + + try { + ByteArrayUtil.replace(null, 0, 1, new byte[]{0x00}, new byte[]{0x00, FF}); + throw new RuntimeException("able to replace null bytes"); + } + catch(NullPointerException e) { + // eat + } + try { + ByteArrayUtil.replace(new byte[]{0x00, 0x01}, -1, 2, new byte[]{0x00}, new byte[]{0x00, FF}); + throw new RuntimeException("able to use negative offset"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + ByteArrayUtil.replace(new byte[]{0x00, 0x01}, 3, 2, new byte[]{0x00}, new byte[]{0x00, FF}); + throw new RuntimeException("able to use offset after end of array"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + ByteArrayUtil.replace(new byte[]{0x00, 0x01}, 1, -1, new byte[]{0x00}, new byte[]{0x00, FF}); + throw new RuntimeException("able to use negative length"); + } + catch(IllegalArgumentException e) { + // eat + } + try { + ByteArrayUtil.replace(new byte[]{0x00, 0x01}, 1, 2, new byte[]{0x00}, new byte[]{0x00, FF}); + throw new RuntimeException("able to give length that exceeds end of the array"); + } + catch(IllegalArgumentException e) { + // eat + } } private static void runTests(final int reps, TransactionContext db) { From 734029820269a09af1228a6d8572df443aab4a8b Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Fri, 1 Mar 2019 17:05:48 -0800 Subject: [PATCH 33/46] remove debugging printing that was accidentally added --- .../src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java index 83a49051e1..fe39fa332e 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java +++ 
b/bindings/java/src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java @@ -187,10 +187,6 @@ public class ByteArrayUtil { // Array might change size. This is the "tricky" case. int newLength = replace(src, offset, length, pattern, replacement, null); if(newLength != length) { - if(newLength < 0) { - System.out.println("oops"); - newLength = replace(src, offset, length, pattern, replacement, null); - } dest = ByteBuffer.allocate(newLength); } else { From d9e9e0c5211dd05990e6147cf0fd6dfcc0fed352 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Mon, 11 Mar 2019 18:26:08 -0700 Subject: [PATCH 34/46] use bitwise or instead of addition when reconsituting long --- .../java/src/main/com/apple/foundationdb/tuple/TupleUtil.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java index e0e43e48df..6ddfae83f9 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/TupleUtil.java @@ -509,14 +509,14 @@ class TupleUtil { if(positive && (n < Long.BYTES || rep[start] > 0)) { long res = 0L; for(int i = start; i < end; i++) { - res = (res << 8) + (rep[i] & 0xff); + res = (res << 8) | (rep[i] & 0xff); } state.add(res, end); } else if(!positive && (n < Long.BYTES || rep[start] < 0)) { long res = ~0L; for(int i = start; i < end; i++) { - res = (res << 8) + (rep[i] & 0xff); + res = (res << 8) | (rep[i] & 0xff); } state.add(res + 1, end); } From 2b0139670e413d634425d59b76913c832ae6d1e4 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 12 Mar 2019 11:34:16 -0700 Subject: [PATCH 35/46] Fix review comment for PR 1176 --- fdbserver/ClusterController.actor.cpp | 26 ++++++-------- fdbserver/DataDistribution.actor.cpp | 50 +++++++++++++------------- fdbserver/Ratekeeper.actor.cpp | 52 +++++++++++---------------- fdbserver/RatekeeperInterface.h | 2 +- fdbserver/worker.actor.cpp | 4 +++ 5 files changed, 60 insertions(+), 74 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index c4b0213eeb..55316b5ba7 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -2335,12 +2335,10 @@ ACTOR Future handleForcedRecoveries( ClusterControllerData *self, ClusterC } ACTOR Future startDataDistributor( ClusterControllerData *self ) { - state Optional dcId = self->clusterControllerDcId; while ( !self->clusterControllerProcessId.present() || !self->masterProcessId.present() ) { wait( delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY) ); } - state UID reqId; loop { try { while ( self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ) { @@ -2348,19 +2346,18 @@ ACTOR Future startDataDistributor( ClusterControllerDa } std::map>, int> id_used = self->getUsedIds(); - state WorkerFitnessInfo data_distributor = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::DataDistributor, ProcessClass::NeverAssign, self->db.config, id_used); - reqId = g_random->randomUniqueID(); - state InitializeDataDistributorRequest req(reqId); - TraceEvent("ClusterController_DataDistributorRecruit", req.reqId).detail("Addr", data_distributor.worker.first.address()); + state WorkerFitnessInfo data_distributor = self->getWorkerForRoleInDatacenter(self->clusterControllerDcId, ProcessClass::DataDistributor, ProcessClass::NeverAssign, self->db.config, id_used); + state InitializeDataDistributorRequest 
req(g_random->randomUniqueID()); + TraceEvent("ClusterController_DataDistributorRecruit", self->id).detail("Addr", data_distributor.worker.first.address()); ErrorOr distributor = wait( data_distributor.worker.first.dataDistributor.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); if (distributor.present()) { - TraceEvent("ClusterController_DataDistributorRecruited", req.reqId).detail("Addr", data_distributor.worker.first.address()); + TraceEvent("ClusterController_DataDistributorRecruited", self->id).detail("Addr", data_distributor.worker.first.address()); return distributor.get(); } } catch (Error& e) { - TraceEvent("ClusterController_DataDistributorRecruitError", reqId).error(e); + TraceEvent("ClusterController_DataDistributorRecruitError", self->id).error(e); if ( e.code() != error_code_no_more_servers ) { throw; } @@ -2398,7 +2395,6 @@ ACTOR Future monitorDataDistributor(ClusterControllerData *self) { } ACTOR Future startRatekeeper(ClusterControllerData *self) { - state UID reqId; loop { try { while ( self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ) { @@ -2406,20 +2402,18 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { } std::map>, int> id_used = self->getUsedIds(); - Optional dcId = self->clusterControllerDcId; - state WorkerFitnessInfo rkWorker = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::RateKeeper, ProcessClass::NeverAssign, self->db.config, id_used); - reqId = g_random->randomUniqueID(); - state InitializeRatekeeperRequest req(reqId); - TraceEvent("ClusterController_RecruitRatekeeper", req.reqId).detail("Addr", rkWorker.worker.first.address()); + state WorkerFitnessInfo rkWorker = self->getWorkerForRoleInDatacenter(self->clusterControllerDcId, ProcessClass::RateKeeper, ProcessClass::NeverAssign, self->db.config, id_used); + state InitializeRatekeeperRequest req(g_random->randomUniqueID()); + TraceEvent("ClusterController_RecruitRatekeeper", self->id).detail("Addr", rkWorker.worker.first.address()); ErrorOr interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY, 0) ); if (interf.present()) { - TraceEvent("ClusterController_RatekeeperRecruited", req.reqId).detail("Addr", rkWorker.worker.first.address()); + TraceEvent("ClusterController_RatekeeperRecruited", self->id).detail("Addr", rkWorker.worker.first.address()); return interf.get(); } } catch (Error& e) { - TraceEvent("ClusterController_RatekeeperRecruitError", reqId).error(e); + TraceEvent("ClusterController_RatekeeperRecruitError", self->id).error(e); if ( e.code() != error_code_no_more_servers ) { throw; } diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index e6af98ce49..f57e5c6b13 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3455,8 +3455,30 @@ struct DataDistributorData : NonCopyable, ReferenceCounted DataDistributorData(Reference> const& db, UID id) : dbInfo(db), ddId(id) {} }; -ACTOR Future dataDistribution(Reference self, double* lastLimited) +ACTOR Future monitorBatchLimitedTime(Reference> db, double* lastLimited) { + loop { + wait( delay(SERVER_KNOBS->METRIC_UPDATE_RATE) ); + + state Reference proxies(new ProxyInfo(db->get().client.proxies, db->get().myLocality)); + + choose { + when (wait(db->onChange())) {} + when (GetHealthMetricsReply reply = wait(proxies->size() ? 
+ loadBalance(proxies, &MasterProxyInterface::getHealthMetrics, GetHealthMetricsRequest(false)) + : Never())) { + if (reply.healthMetrics.batchLimited) { + *lastLimited = now(); + } + } + } + } +} + +ACTOR Future dataDistribution(Reference self) { + state double lastLimited = 0; + self->addActor.send( monitorBatchLimitedTime(self->dbInfo, &lastLimited) ); + state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; @@ -3612,7 +3634,7 @@ ACTOR Future dataDistribution(Reference self, double* actors.push_back( pollMoveKeysLock(cx, lock) ); actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) ); - actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, &lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector teamCollectionsPtrs; Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); @@ -3654,36 +3676,14 @@ static std::set const& normalDataDistributorErrors() { return s; } -ACTOR Future monitorBatchLimitedTime(Reference> db, double* lastLimited) { - loop { - wait( delay(SERVER_KNOBS->METRIC_UPDATE_RATE) ); - while (db->get().client.proxies.size() == 0) { - wait(db->onChange()); - } - - state int idx = g_random->randomInt(0, db->get().client.proxies.size()); - choose { - when (wait(db->onChange())) {} - when (ErrorOr reply = wait( - db->get().client.proxies[idx].getHealthMetrics.getReplyUnlessFailedFor(GetHealthMetricsRequest(false), 1.0, 0))) { - if (reply.present() && reply.get().healthMetrics.batchLimited) { - *lastLimited = now(); - } - } - } - } -} - ACTOR Future dataDistributor(DataDistributorInterface di, Reference> db ) { state Reference self( new DataDistributorData(db, di.id()) ); state Future collection = actorCollection( self->addActor.getFuture() ); - state double lastLimited = 0; try { TraceEvent("DataDistributor_Running", di.id()); self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) ); - self->addActor.send( monitorBatchLimitedTime(db, &lastLimited) ); - state Future distributor = reportErrorsExcept( dataDistribution(self, &lastLimited), "DataDistribution", di.id(), &normalDataDistributorErrors() ); + state Future distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); wait( distributor || collection ); } diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 46dcfe25f0..a89271f0c0 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -294,50 +294,38 @@ ACTOR Future monitorServerListChange( Reference> dbInfo, PromiseStream< std::pair> > serverChanges) { state 
Database db = openDBOnServer(dbInfo, TaskRateKeeper, true, true); - state Future checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY); - state Future>> serverListAndProcessClasses = Never(); state std::map oldServers; state Transaction tr(db); loop { try { - choose { - when ( wait( checkSignal ) ) { - checkSignal = Never(); - serverListAndProcessClasses = getServerListAndProcessClasses(&tr); - } - when ( vector> results = wait( serverListAndProcessClasses ) ) { - serverListAndProcessClasses = Never(); + vector> results = wait(getServerListAndProcessClasses(&tr)); - std::map newServers; - for (int i = 0; i < results.size(); i++) { - const StorageServerInterface& ssi = results[i].first; - const UID serverId = ssi.id(); - newServers[serverId] = ssi; + std::map newServers; + for (int i = 0; i < results.size(); i++) { + const StorageServerInterface& ssi = results[i].first; + const UID serverId = ssi.id(); + newServers[serverId] = ssi; - if (oldServers.count(serverId)) { - if (ssi.getValue.getEndpoint() != oldServers[serverId].getValue.getEndpoint()) { - serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); - } - oldServers.erase(serverId); - } else { - serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); - } + if (oldServers.count(serverId)) { + if (ssi.getValue.getEndpoint() != oldServers[serverId].getValue.getEndpoint()) { + serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); } - - for (const auto& it : oldServers) { - serverChanges.send( std::make_pair(it.first, Optional()) ); - } - - oldServers.swap(newServers); - tr = Transaction(db); - checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY); + oldServers.erase(serverId); + } else { + serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); } } + + for (const auto& it : oldServers) { + serverChanges.send( std::make_pair(it.first, Optional()) ); + } + + oldServers.swap(newServers); + tr = Transaction(db); + wait(delay(SERVER_KNOBS->SERVER_LIST_DELAY)); } catch(Error& e) { wait( tr.onError(e) ); - serverListAndProcessClasses = Never(); - checkSignal = Void(); } } } diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h index c50447d544..cd8ffeb126 100644 --- a/fdbserver/RatekeeperInterface.h +++ b/fdbserver/RatekeeperInterface.h @@ -1,5 +1,5 @@ /* - * DataDistributorInterface.h + * RatekeeperInterface.h * * This source file is part of the FoundationDB open source project * diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 88d8ec85d0..2479adbff5 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -832,6 +832,7 @@ ACTOR Future workerServer( Reference connFile, Refe TEST(true); // Recruited while already a data distributor. } else { startRole( Role::DATA_DISTRIBUTOR, recruited.id(), interf.id() ); + DUMPTOKEN( recruited.waitFailure ); Future dataDistributorProcess = dataDistributor( recruited, dbInfo ); errorForwarders.add( forwardError( errors, Role::DATA_DISTRIBUTOR, recruited.id(), setWhenDoneOrError( dataDistributorProcess, ddInterf, Optional() ) ) ); @@ -849,6 +850,9 @@ ACTOR Future workerServer( Reference connFile, Refe TEST(true); // Recruited while already a ratekeeper. 
} else { startRole(Role::RATE_KEEPER, recruited.id(), interf.id()); + DUMPTOKEN( recruited.waitFailure ); + DUMPTOKEN( recruited.getRateInfo ); + Future ratekeeper = rateKeeper( recruited, dbInfo ); errorForwarders.add( forwardError( errors, Role::RATE_KEEPER, recruited.id(), setWhenDoneOrError( ratekeeper, rkInterf, Optional() ) ) ); rkInterf->set(Optional(recruited)); From 5392742902aec2450f9412f59639d2f8731e1459 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 12 Mar 2019 14:38:54 -0700 Subject: [PATCH 36/46] fixed review comments --- fdbserver/ClusterController.actor.cpp | 12 ++++++------ flow/genericactors.actor.h | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index da3f15a602..b64330f950 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -290,13 +290,13 @@ public: auto fitnessEnum = (ProcessClass::Fitness) fitness; for(int addingDegraded = 0; addingDegraded < 2; addingDegraded++) { auto workerItr = fitness_workers.find(std::make_pair(fitnessEnum,(bool)addingDegraded)); - if (workerItr == fitness_workers.end()) { - continue; + if (workerItr != fitness_workers.end()) { + for (auto& worker : workerItr->second ) { + logServerMap->add(worker.interf.locality, &worker); + } } - for (auto& worker : workerItr->second ) { - logServerMap->add(worker.interf.locality, &worker); - } - if (logServerSet->size() < required) { + + if (logServerSet->size() < (addingDegraded == 0 ? desired : required)) { TraceEvent(SevWarn,"GWFTADTooFew", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy", policy->info()).detail("DesiredLogs", desired).detail("AddingDegraded", addingDegraded); } else if (logServerSet->size() == required || logServerSet->size() <= desired) { diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 8570553341..8e419a8349 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -783,6 +783,8 @@ Future resetAfter( Reference> var, double time, T val ) { choose { when( wait( resetDelay ) ) { var->set( val ); + isEqual = true; + resetDelay = Never(); } when( wait( var->onChange() ) ) {} } From 931788150aee09ec0130bd04edec253eb757c640 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 12 Mar 2019 15:56:15 -0700 Subject: [PATCH 37/46] Add release note for teamRemover PR --- documentation/sphinx/source/release-notes.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 81cf8b0aca..bcabdd8b59 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -7,7 +7,9 @@ Release Notes Features -------- -* Improved replication mechanism, a new hierarchical replication technique that further significantly reduces the frequency of data loss events even when multiple machines (e.g., fault-tolerant zones in the current code) permanently fail at the same time. `(PR #964) `. +* Improved replication mechanism, a new hierarchical replication technique that further significantly reduces the frequency of data loss events even when multiple machines (e.g., fault-tolerant zones in the current code) permanently fail at the same time. `(PR #964) `_. 
+
+* Added a background actor to remove redundant teams from the team collection so that the number of healthy teams is guaranteed not to exceed the desired number. `(PR #1139) `_

 * Get read version, read, and commit requests are counted and aggregated by server-side latency in configurable latency bands and output in JSON status. `(PR #1084) `_

From 5e552a47b6daffe736ee04a5e1538c8ff6ba4ede Mon Sep 17 00:00:00 2001
From: Vishesh Yadav
Date: Tue, 12 Mar 2019 15:33:32 -0700
Subject: [PATCH 38/46] doc: Updated release notes for 6.1

---
 documentation/sphinx/source/release-notes.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst
index e29d30425c..efa6c17c64 100644
--- a/documentation/sphinx/source/release-notes.rst
+++ b/documentation/sphinx/source/release-notes.rst
@@ -20,6 +20,9 @@ Features
 * Restore target version can now be specified by timestamp if the original cluster is available. `(PR #1240) `_
 * Separate data distribution out from master as a new role. `(PR #1062) `_
 * Separate rate keeper out from data distribution as a new role. `(PR #1176) `_
+* Added a new atomic op `CompareAndClear`. `(PR #1105) `_
+* Added support for IPv6. `(PR #1178) <https://github.com/apple/foundationdb/pull/1178>`_
+* FDB can now simultaneously listen to TLS and unencrypted ports to facilitate smoother migration to TLS. `(PR #1157) <https://github.com/apple/foundationdb/pull/1157>`_

 Performance
 -----------
@@ -30,6 +33,7 @@ Fixes
 -----

 * Python: Creating a ``SingleFloat`` for the tuple layer didn't work with integers. `(PR #1216) `_
+* Added a `USE_EIO_FILE` knob to fall back to libeio instead of kernel async I/O (KAIO) for systems that do not support KAIO or the O_DIRECT flag. `(PR #1283) <https://github.com/apple/foundationdb/pull/1283>`_

 Status
 ------
@@ -58,6 +62,8 @@ Bindings

 Other Changes
 -------------
+* Migrated to Boost 1.67. `(PR #1242) <https://github.com/apple/foundationdb/pull/1242>`_

 Earlier release notes
 ---------------------
 * :doc:`6.0 (API Version 600) `

From ff8bac8d208f94d1fb389ff3a4a022bdf47cf319 Mon Sep 17 00:00:00 2001
From: Vishesh Yadav
Date: Tue, 12 Mar 2019 17:58:55 -0700
Subject: [PATCH 39/46] doc: Some documentation for IPv6

---
 documentation/sphinx/source/administration.rst | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/documentation/sphinx/source/administration.rst b/documentation/sphinx/source/administration.rst
index 8ddc88ac6f..1e0111b584 100644
--- a/documentation/sphinx/source/administration.rst
+++ b/documentation/sphinx/source/administration.rst
@@ -141,6 +141,21 @@ Any client connected to FoundationDB can access information about its cluster fi
 * To get the path to the cluster file, read the key ``\xFF\xFF/cluster_file_path``.
 * To get the contents of the cluster file, read the key ``\xFF\xFF/connection_string``.

+.. _ipv6-support:
+
+IPv6 Support
+============
+
+FoundationDB (since v6.1) can accept network connections from clients connecting over IPv6. An IPv6 address/port pair is represented as ``[IP]:PORT``, e.g. "[::1]:4800", "[abcd::dead:beef]:4500".
+
+1) The cluster file can contain a mix of IPv4 and IPv6 addresses. For example::
+
+   description:ID@127.0.0.1:4500,[::1]:4500,...
+
+2) Starting ``fdbserver`` with IPv6::
+
+   $ /path/to/fdbserver -C fdb.cluster -p \[::1\]:4500
+
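The brackets are what make the mixed cluster file unambiguous: a bare IPv6 address already contains colons, so host and port can no longer be split at the first ':'. A minimal standalone sketch of splitting the two forms (a hypothetical helper for illustration, not FDB source):

    #include <cstdio>
    #include <string>

    // Split "[IP]:PORT" (IPv6) or "IP:PORT" (IPv4) as described above.
    bool splitAddress(const std::string& s, std::string& ip, std::string& port) {
        if (!s.empty() && s[0] == '[') {           // e.g. [::1]:4500
            size_t close = s.find("]:");
            if (close == std::string::npos) return false;
            ip = s.substr(1, close - 1);
            port = s.substr(close + 2);
        } else {                                   // e.g. 127.0.0.1:4500
            size_t colon = s.rfind(':');
            if (colon == std::string::npos) return false;
            ip = s.substr(0, colon);
            port = s.substr(colon + 1);
        }
        return !ip.empty() && !port.empty();
    }

    int main() {
        std::string ip, port;
        if (splitAddress("[abcd::dead:beef]:4500", ip, port))
            printf("ip=%s port=%s\n", ip.c_str(), port.c_str());
    }

 .. 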
_adding-machines-to-a-cluster: Adding machines to a cluster From a2108047aa3eb2d9718afe6ba74b9a6228c5f072 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 13 Mar 2019 13:14:39 -0700 Subject: [PATCH 40/46] removed LocalitySetRef and IRepPolicyRef typedefs, because for clarity the Ref suffix is reserved for arena allocated objects instead of reference counted objects. --- fdbclient/DatabaseConfiguration.cpp | 28 ++--- fdbclient/DatabaseConfiguration.h | 12 +-- fdbclient/ManagementAPI.actor.cpp | 44 ++++---- fdbrpc/Replication.h | 32 +++--- fdbrpc/ReplicationPolicy.cpp | 36 +++---- fdbrpc/ReplicationPolicy.h | 64 ++++++------ fdbrpc/ReplicationTypes.h | 3 - fdbrpc/ReplicationUtils.cpp | 110 ++++++++++---------- fdbrpc/ReplicationUtils.h | 20 ++-- fdbrpc/simulator.h | 10 +- fdbserver/ClusterController.actor.cpp | 8 +- fdbserver/DBCoreState.h | 2 +- fdbserver/DataDistribution.actor.cpp | 16 +-- fdbserver/LogSystem.h | 10 +- fdbserver/LogSystemConfig.h | 2 +- fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- fdbserver/TagPartitionedLogSystem.actor.cpp | 8 +- fdbserver/WorkerInterface.actor.h | 2 +- 18 files changed, 203 insertions(+), 206 deletions(-) diff --git a/fdbclient/DatabaseConfiguration.cpp b/fdbclient/DatabaseConfiguration.cpp index 0af3402b73..1bc518e0e4 100644 --- a/fdbclient/DatabaseConfiguration.cpp +++ b/fdbclient/DatabaseConfiguration.cpp @@ -38,7 +38,7 @@ void DatabaseConfiguration::resetInternal() { autoDesiredTLogCount = CLIENT_KNOBS->DEFAULT_AUTO_LOGS; usableRegions = 1; regions.clear(); - tLogPolicy = storagePolicy = remoteTLogPolicy = IRepPolicyRef(); + tLogPolicy = storagePolicy = remoteTLogPolicy = Reference(); remoteDesiredTLogCount = -1; remoteTLogReplicationFactor = repopulateRegionAntiQuorum = 0; } @@ -48,7 +48,7 @@ void parse( int* i, ValueRef const& v ) { *i = atoi(v.toString().c_str()); } -void parseReplicationPolicy(IRepPolicyRef* policy, ValueRef const& v) { +void parseReplicationPolicy(Reference* policy, ValueRef const& v) { BinaryReader reader(v, IncludeVersion()); serializeReplicationPolicy(reader, *policy); } @@ -91,35 +91,35 @@ void parse( std::vector* regions, ValueRef const& v ) { info.satelliteTLogReplicationFactor = 1; info.satelliteTLogUsableDcs = 1; info.satelliteTLogWriteAntiQuorum = 0; - info.satelliteTLogPolicy = IRepPolicyRef(new PolicyOne()); + info.satelliteTLogPolicy = Reference(new PolicyOne()); } else if(satelliteReplication == "one_satellite_double") { info.satelliteTLogReplicationFactor = 2; info.satelliteTLogUsableDcs = 1; info.satelliteTLogWriteAntiQuorum = 0; - info.satelliteTLogPolicy = IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))); + info.satelliteTLogPolicy = Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))); } else if(satelliteReplication == "one_satellite_triple") { info.satelliteTLogReplicationFactor = 3; info.satelliteTLogUsableDcs = 1; info.satelliteTLogWriteAntiQuorum = 0; - info.satelliteTLogPolicy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne()))); + info.satelliteTLogPolicy = Reference(new PolicyAcross(3, "zoneid", Reference(new PolicyOne()))); } else if(satelliteReplication == "two_satellite_safe") { info.satelliteTLogReplicationFactor = 4; info.satelliteTLogUsableDcs = 2; info.satelliteTLogWriteAntiQuorum = 0; - info.satelliteTLogPolicy = IRepPolicyRef(new PolicyAcross(2, "dcid", IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))))); + info.satelliteTLogPolicy = Reference(new PolicyAcross(2, "dcid", Reference(new 
PolicyAcross(2, "zoneid", Reference(new PolicyOne()))))); info.satelliteTLogReplicationFactorFallback = 2; info.satelliteTLogUsableDcsFallback = 1; info.satelliteTLogWriteAntiQuorumFallback = 0; - info.satelliteTLogPolicyFallback = IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))); + info.satelliteTLogPolicyFallback = Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))); } else if(satelliteReplication == "two_satellite_fast") { info.satelliteTLogReplicationFactor = 4; info.satelliteTLogUsableDcs = 2; info.satelliteTLogWriteAntiQuorum = 2; - info.satelliteTLogPolicy = IRepPolicyRef(new PolicyAcross(2, "dcid", IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))))); + info.satelliteTLogPolicy = Reference(new PolicyAcross(2, "dcid", Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))))); info.satelliteTLogReplicationFactorFallback = 2; info.satelliteTLogUsableDcsFallback = 1; info.satelliteTLogWriteAntiQuorumFallback = 0; - info.satelliteTLogPolicyFallback = IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))); + info.satelliteTLogPolicyFallback = Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))); } else { throw invalid_option(); } @@ -141,20 +141,20 @@ void parse( std::vector* regions, ValueRef const& v ) { void DatabaseConfiguration::setDefaultReplicationPolicy() { if(!storagePolicy) { - storagePolicy = IRepPolicyRef(new PolicyAcross(storageTeamSize, "zoneid", IRepPolicyRef(new PolicyOne()))); + storagePolicy = Reference(new PolicyAcross(storageTeamSize, "zoneid", Reference(new PolicyOne()))); } if(!tLogPolicy) { - tLogPolicy = IRepPolicyRef(new PolicyAcross(tLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne()))); + tLogPolicy = Reference(new PolicyAcross(tLogReplicationFactor, "zoneid", Reference(new PolicyOne()))); } if(remoteTLogReplicationFactor > 0 && !remoteTLogPolicy) { - remoteTLogPolicy = IRepPolicyRef(new PolicyAcross(remoteTLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne()))); + remoteTLogPolicy = Reference(new PolicyAcross(remoteTLogReplicationFactor, "zoneid", Reference(new PolicyOne()))); } for(auto& r : regions) { if(r.satelliteTLogReplicationFactor > 0 && !r.satelliteTLogPolicy) { - r.satelliteTLogPolicy = IRepPolicyRef(new PolicyAcross(r.satelliteTLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne()))); + r.satelliteTLogPolicy = Reference(new PolicyAcross(r.satelliteTLogReplicationFactor, "zoneid", Reference(new PolicyOne()))); } if(r.satelliteTLogReplicationFactorFallback > 0 && !r.satelliteTLogPolicyFallback) { - r.satelliteTLogPolicyFallback = IRepPolicyRef(new PolicyAcross(r.satelliteTLogReplicationFactorFallback, "zoneid", IRepPolicyRef(new PolicyOne()))); + r.satelliteTLogPolicyFallback = Reference(new PolicyAcross(r.satelliteTLogReplicationFactorFallback, "zoneid", Reference(new PolicyOne()))); } } } diff --git a/fdbclient/DatabaseConfiguration.h b/fdbclient/DatabaseConfiguration.h index 5df38f1fb2..18bf0b0352 100644 --- a/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/DatabaseConfiguration.h @@ -49,13 +49,13 @@ struct RegionInfo { Key dcId; int32_t priority; - IRepPolicyRef satelliteTLogPolicy; + Reference satelliteTLogPolicy; int32_t satelliteDesiredTLogCount; int32_t satelliteTLogReplicationFactor; int32_t satelliteTLogWriteAntiQuorum; int32_t satelliteTLogUsableDcs; - IRepPolicyRef satelliteTLogPolicyFallback; + Reference satelliteTLogPolicyFallback; int32_t satelliteTLogReplicationFactorFallback; int32_t 
satelliteTLogWriteAntiQuorumFallback; int32_t satelliteTLogUsableDcsFallback; @@ -157,7 +157,7 @@ struct DatabaseConfiguration { int32_t autoResolverCount; // TLogs - IRepPolicyRef tLogPolicy; + Reference tLogPolicy; int32_t desiredTLogCount; int32_t autoDesiredTLogCount; int32_t tLogWriteAntiQuorum; @@ -167,7 +167,7 @@ struct DatabaseConfiguration { TLogSpillType tLogSpillType; // Storage Servers - IRepPolicyRef storagePolicy; + Reference storagePolicy; int32_t storageTeamSize; KeyValueStoreType storageServerStoreType; @@ -175,7 +175,7 @@ struct DatabaseConfiguration { int32_t desiredLogRouterCount; int32_t remoteDesiredTLogCount; int32_t remoteTLogReplicationFactor; - IRepPolicyRef remoteTLogPolicy; + Reference remoteTLogPolicy; //Data centers int32_t usableRegions; @@ -195,7 +195,7 @@ struct DatabaseConfiguration { if(desired == -1) return autoDesiredTLogCount; return desired; } int32_t getRemoteTLogReplicationFactor() const { if(remoteTLogReplicationFactor == 0) return tLogReplicationFactor; return remoteTLogReplicationFactor; } - IRepPolicyRef getRemoteTLogPolicy() const { if(remoteTLogReplicationFactor == 0) return tLogPolicy; return remoteTLogPolicy; } + Reference getRemoteTLogPolicy() const { if(remoteTLogReplicationFactor == 0) return tLogPolicy; return remoteTLogPolicy; } bool operator == ( DatabaseConfiguration const& rhs ) const { const_cast(this)->makeConfigurationImmutable(); diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index fa54b5b391..04cbbeb45e 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -99,42 +99,42 @@ std::map configForToken( std::string const& mode ) { } std::string redundancy, log_replicas; - IRepPolicyRef storagePolicy; - IRepPolicyRef tLogPolicy; + Reference storagePolicy; + Reference tLogPolicy; bool redundancySpecified = true; if (mode == "single") { redundancy="1"; log_replicas="1"; - storagePolicy = tLogPolicy = IRepPolicyRef(new PolicyOne()); + storagePolicy = tLogPolicy = Reference(new PolicyOne()); } else if(mode == "double" || mode == "fast_recovery_double") { redundancy="2"; log_replicas="2"; - storagePolicy = tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))); + storagePolicy = tLogPolicy = Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))); } else if(mode == "triple" || mode == "fast_recovery_triple") { redundancy="3"; log_replicas="3"; - storagePolicy = tLogPolicy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne()))); + storagePolicy = tLogPolicy = Reference(new PolicyAcross(3, "zoneid", Reference(new PolicyOne()))); } else if(mode == "three_datacenter" || mode == "multi_dc") { redundancy="6"; log_replicas="4"; - storagePolicy = IRepPolicyRef(new PolicyAcross(3, "dcid", - IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))) + storagePolicy = Reference(new PolicyAcross(3, "dcid", + Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))) )); - tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "dcid", - IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))) + tLogPolicy = Reference(new PolicyAcross(2, "dcid", + Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))) )); } else if(mode == "three_datacenter_fallback") { redundancy="4"; log_replicas="4"; - storagePolicy = tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "dcid", IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))))); + storagePolicy = 
tLogPolicy = Reference(new PolicyAcross(2, "dcid", Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))))); } else if(mode == "three_data_hall") { redundancy="3"; log_replicas="4"; - storagePolicy = IRepPolicyRef(new PolicyAcross(3, "data_hall", IRepPolicyRef(new PolicyOne()))); - tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "data_hall", - IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))) + storagePolicy = Reference(new PolicyAcross(3, "data_hall", Reference(new PolicyOne()))); + tLogPolicy = Reference(new PolicyAcross(2, "data_hall", + Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))) )); } else redundancySpecified = false; @@ -154,29 +154,29 @@ std::map configForToken( std::string const& mode ) { } std::string remote_redundancy, remote_log_replicas; - IRepPolicyRef remoteTLogPolicy; + Reference remoteTLogPolicy; bool remoteRedundancySpecified = true; if (mode == "remote_default") { remote_redundancy="0"; remote_log_replicas="0"; - remoteTLogPolicy = IRepPolicyRef(); + remoteTLogPolicy = Reference(); } else if (mode == "remote_single") { remote_redundancy="1"; remote_log_replicas="1"; - remoteTLogPolicy = IRepPolicyRef(new PolicyOne()); + remoteTLogPolicy = Reference(new PolicyOne()); } else if(mode == "remote_double") { remote_redundancy="2"; remote_log_replicas="2"; - remoteTLogPolicy = IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))); + remoteTLogPolicy = Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))); } else if(mode == "remote_triple") { remote_redundancy="3"; remote_log_replicas="3"; - remoteTLogPolicy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne()))); + remoteTLogPolicy = Reference(new PolicyAcross(3, "zoneid", Reference(new PolicyOne()))); } else if(mode == "remote_three_data_hall") { //FIXME: not tested in simulation remote_redundancy="3"; remote_log_replicas="4"; - remoteTLogPolicy = IRepPolicyRef(new PolicyAcross(2, "data_hall", - IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))) + remoteTLogPolicy = Reference(new PolicyAcross(2, "data_hall", + Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne()))) )); } else remoteRedundancySpecified = false; @@ -212,7 +212,7 @@ ConfigurationResult::Type buildConfiguration( std::vector const& mode auto p = configKeysPrefix.toString(); if(!outConf.count(p + "storage_replication_policy") && outConf.count(p + "storage_replicas")) { int storageCount = stoi(outConf[p + "storage_replicas"]); - IRepPolicyRef storagePolicy = IRepPolicyRef(new PolicyAcross(storageCount, "zoneid", IRepPolicyRef(new PolicyOne()))); + Reference storagePolicy = Reference(new PolicyAcross(storageCount, "zoneid", Reference(new PolicyOne()))); BinaryWriter policyWriter(IncludeVersion()); serializeReplicationPolicy(policyWriter, storagePolicy); outConf[p+"storage_replication_policy"] = policyWriter.toStringRef().toString(); @@ -220,7 +220,7 @@ ConfigurationResult::Type buildConfiguration( std::vector const& mode if(!outConf.count(p + "log_replication_policy") && outConf.count(p + "log_replicas")) { int logCount = stoi(outConf[p + "log_replicas"]); - IRepPolicyRef logPolicy = IRepPolicyRef(new PolicyAcross(logCount, "zoneid", IRepPolicyRef(new PolicyOne()))); + Reference logPolicy = Reference(new PolicyAcross(logCount, "zoneid", Reference(new PolicyOne()))); BinaryWriter policyWriter(IncludeVersion()); serializeReplicationPolicy(policyWriter, logPolicy); outConf[p+"log_replication_policy"] = 
policyWriter.toStringRef().toString(); diff --git a/fdbrpc/Replication.h b/fdbrpc/Replication.h index 828ca1fd42..e8e32b79fa 100644 --- a/fdbrpc/Replication.h +++ b/fdbrpc/Replication.h @@ -36,23 +36,23 @@ public: virtual void delref() { ReferenceCounted::delref(); } bool selectReplicas( - IRepPolicyRef const& policy, + Reference const& policy, std::vector const& alsoServers, std::vector & results) { - LocalitySetRef fromServers = LocalitySetRef::addRef(this); + Reference fromServers = Reference::addRef(this); return policy->selectReplicas(fromServers, alsoServers, results); } bool selectReplicas( - IRepPolicyRef const& policy, + Reference const& policy, std::vector & results) { return selectReplicas(policy, std::vector(), results); } bool validate( - IRepPolicyRef const& policy) const + Reference const& policy) const { - LocalitySetRef const solutionSet = LocalitySetRef::addRef((LocalitySet*) this); + Reference const solutionSet = Reference::addRef((LocalitySet*) this); return policy->validate(solutionSet); } @@ -159,7 +159,7 @@ public: } static void staticDisplayEntries( - LocalitySetRef const& fromServers, + Reference const& fromServers, std::vector const& entryArray, const char* name = "zone") { @@ -174,8 +174,8 @@ public: // the specified value for the given key // The returned LocalitySet contains the LocalityRecords that have the same value as // the indexValue under the same indexKey (e.g., zoneid) - LocalitySetRef restrict(AttribKey indexKey, AttribValue indexValue ) { - LocalitySetRef localitySet; + Reference restrict(AttribKey indexKey, AttribValue indexValue ) { + Reference localitySet; LocalityCacheRecord searchRecord(AttribRecord(indexKey, indexValue), localitySet); auto itKeyValue = std::lower_bound(_cacheArray.begin(), _cacheArray.end(), searchRecord, LocalityCacheRecord::compareKeyValue); @@ -185,7 +185,7 @@ public: localitySet = itKeyValue->_resultset; } else { - localitySet = LocalitySetRef(new LocalitySet(*_localitygroup)); + localitySet = Reference(new LocalitySet(*_localitygroup)); _cachemisses ++; // If the key is not within the current key set, skip it because no items within // the current entry array has the key @@ -213,8 +213,8 @@ public: } // This function is used to create an subset containing the specified entries - LocalitySetRef restrict(std::vector const& entryArray) { - LocalitySetRef localitySet(new LocalitySet(*_localitygroup)); + Reference restrict(std::vector const& entryArray) { + Reference localitySet(new LocalitySet(*_localitygroup)); for (auto& entry : entryArray) { localitySet->add(getRecordViaEntry(entry), *this); } @@ -453,8 +453,8 @@ protected: // This class stores the cache record for each entry within the locality set struct LocalityCacheRecord { AttribRecord _attribute; - LocalitySetRef _resultset; - LocalityCacheRecord(AttribRecord const& attribute, LocalitySetRef resultset):_attribute(attribute),_resultset(resultset){} + Reference _resultset; + LocalityCacheRecord(AttribRecord const& attribute, Reference resultset):_attribute(attribute),_resultset(resultset){} LocalityCacheRecord(LocalityCacheRecord const& source):_attribute(source._attribute),_resultset(source._resultset){} virtual ~LocalityCacheRecord(){} LocalityCacheRecord& operator=(LocalityCacheRecord const& source) { @@ -584,7 +584,7 @@ struct LocalityMap : public LocalityGroup { virtual ~LocalityMap() {} bool selectReplicas( - IRepPolicyRef const& policy, + Reference const& policy, std::vector const& alsoServers, std::vector& entryResults, std::vector & results) @@ -601,7 +601,7 @@ 
struct LocalityMap : public LocalityGroup { } bool selectReplicas( - IRepPolicyRef const& policy, + Reference const& policy, std::vector const& alsoServers, std::vector & results) { @@ -610,7 +610,7 @@ struct LocalityMap : public LocalityGroup { } bool selectReplicas( - IRepPolicyRef const& policy, + Reference const& policy, std::vector & results) { return selectReplicas(policy, std::vector(), results); } diff --git a/fdbrpc/ReplicationPolicy.cpp b/fdbrpc/ReplicationPolicy.cpp index 070b8dd767..59b8f511d1 100644 --- a/fdbrpc/ReplicationPolicy.cpp +++ b/fdbrpc/ReplicationPolicy.cpp @@ -24,14 +24,14 @@ bool IReplicationPolicy::selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector & results ) { return selectReplicas(fromServers, std::vector(), results); } bool IReplicationPolicy::validate( - LocalitySetRef const& solutionSet ) const + Reference const& solutionSet ) const { return validate(solutionSet->getEntries(), solutionSet); } @@ -40,7 +40,7 @@ bool IReplicationPolicy::validateFull( bool solved, std::vector const& solutionSet, std::vector const& alsoServers, - LocalitySetRef const& fromServers ) + Reference const& fromServers ) { bool valid = true; std::vector totalSolution(solutionSet); @@ -105,7 +105,7 @@ bool IReplicationPolicy::validateFull( } bool PolicyOne::selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector const& alsoServers, std::vector & results ) { @@ -131,12 +131,12 @@ bool PolicyOne::selectReplicas( bool PolicyOne::validate( std::vector const& solutionSet, - LocalitySetRef const& fromServers ) const + Reference const& fromServers ) const { return ((solutionSet.size() > 0) && (fromServers->size() > 0)); } -PolicyAcross::PolicyAcross(int count, std::string const& attribKey, IRepPolicyRef const policy): +PolicyAcross::PolicyAcross(int count, std::string const& attribKey, Reference const policy): _count(count),_attribKey(attribKey),_policy(policy) { return; @@ -150,7 +150,7 @@ PolicyAcross::~PolicyAcross() // Debug purpose only // Trace all record entries to help debug // fromServers is the servers locality to be printed out. -void IReplicationPolicy::traceLocalityRecords(LocalitySetRef const& fromServers) { +void IReplicationPolicy::traceLocalityRecords(Reference const& fromServers) { std::vector> const& recordArray = fromServers->getRecordArray(); TraceEvent("LocalityRecordArray").detail("Size", recordArray.size()); for (auto& record : recordArray) { @@ -158,7 +158,7 @@ void IReplicationPolicy::traceLocalityRecords(LocalitySetRef const& fromServers) } } -void IReplicationPolicy::traceOneLocalityRecord(Reference record, LocalitySetRef const& fromServers) { +void IReplicationPolicy::traceOneLocalityRecord(Reference record, Reference const& fromServers) { int localityEntryIndex = record->_entryIndex._id; Reference const& dataMap = record->_dataMap; std::vector const& keyValueArray = dataMap->_keyvaluearray; @@ -185,7 +185,7 @@ void IReplicationPolicy::traceOneLocalityRecord(Reference record // return true if the team satisfies the policy; false otherwise bool PolicyAcross::validate( std::vector const& solutionSet, - LocalitySetRef const& fromServers ) const + Reference const& fromServers ) const { bool valid = true; int count = 0; @@ -262,7 +262,7 @@ bool PolicyAcross::validate( // that should be excluded from being selected as replicas. 
// FIXME: Simplify this function, such as removing unnecessary printf bool PolicyAcross::selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector const& alsoServers, std::vector & results ) { @@ -437,7 +437,7 @@ bool PolicyAcross::selectReplicas( bool PolicyAnd::validate( std::vector const& solutionSet, - LocalitySetRef const& fromServers ) const + Reference const& fromServers ) const { bool valid = true; for (auto& policy : _policies) { @@ -450,7 +450,7 @@ bool PolicyAnd::validate( } bool PolicyAnd::selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector const& alsoServers, std::vector & results ) { @@ -486,26 +486,26 @@ bool PolicyAnd::selectReplicas( return passed; } -void testPolicySerialization(IRepPolicyRef& policy) { +void testPolicySerialization(Reference& policy) { std::string policyInfo = policy->info(); BinaryWriter writer(IncludeVersion()); serializeReplicationPolicy(writer, policy); BinaryReader reader(writer.getData(), writer.getLength(), IncludeVersion()); - IRepPolicyRef copy; + Reference copy; serializeReplicationPolicy(reader, copy); ASSERT(policy->info() == copy->info()); } void testReplicationPolicy(int nTests) { - IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(1, "data_hall", IRepPolicyRef(new PolicyOne()))); + Reference policy = Reference(new PolicyAcross(1, "data_hall", Reference(new PolicyOne()))); testPolicySerialization(policy); - policy = IRepPolicyRef(new PolicyAnd({ - IRepPolicyRef(new PolicyAcross(2, "data_center", IRepPolicyRef(new PolicyAcross(3, "rack", IRepPolicyRef(new PolicyOne()))))), - IRepPolicyRef(new PolicyAcross(2, "data_center", IRepPolicyRef(new PolicyAcross(2, "data_hall", IRepPolicyRef(new PolicyOne()))))) + policy = Reference(new PolicyAnd({ + Reference(new PolicyAcross(2, "data_center", Reference(new PolicyAcross(3, "rack", Reference(new PolicyOne()))))), + Reference(new PolicyAcross(2, "data_center", Reference(new PolicyAcross(2, "data_hall", Reference(new PolicyOne()))))) })); testPolicySerialization(policy); diff --git a/fdbrpc/ReplicationPolicy.h b/fdbrpc/ReplicationPolicy.h index 74bc0baa80..74ccdbb312 100644 --- a/fdbrpc/ReplicationPolicy.h +++ b/fdbrpc/ReplicationPolicy.h @@ -26,7 +26,7 @@ #include "fdbrpc/ReplicationTypes.h" template -void serializeReplicationPolicy(Ar& ar, IRepPolicyRef& policy); +void serializeReplicationPolicy(Ar& ar, Reference& policy); extern void testReplicationPolicy(int nTests); @@ -40,36 +40,36 @@ struct IReplicationPolicy : public ReferenceCounted { virtual int maxResults() const = 0; virtual int depth() const = 0; virtual bool selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector const& alsoServers, std::vector & results ) = 0; - virtual void traceLocalityRecords(LocalitySetRef const& fromServers); - virtual void traceOneLocalityRecord(Reference record, LocalitySetRef const& fromServers); + virtual void traceLocalityRecords(Reference const& fromServers); + virtual void traceOneLocalityRecord(Reference record, Reference const& fromServers); virtual bool validate( std::vector const& solutionSet, - LocalitySetRef const& fromServers ) const = 0; + Reference const& fromServers ) const = 0; bool operator == ( const IReplicationPolicy& r ) const { return info() == r.info(); } bool operator != ( const IReplicationPolicy& r ) const { return info() != r.info(); } template void serialize(Ar& ar) { - IRepPolicyRef refThis(this); + Reference refThis(this); serializeReplicationPolicy(ar, refThis); 
refThis->delref_no_destroy(); } // Utility functions bool selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector & results ); bool validate( - LocalitySetRef const& solutionSet ) const; + Reference const& solutionSet ) const; bool validateFull( bool solved, std::vector const& solutionSet, std::vector const& alsoServers, - LocalitySetRef const& fromServers ); + Reference const& fromServers ); // Returns a set of the attributes that this policy uses in selection and validation. std::set attributeKeys() const @@ -78,7 +78,7 @@ struct IReplicationPolicy : public ReferenceCounted { }; template -inline void load( Archive& ar, IRepPolicyRef& value ) { +inline void load( Archive& ar, Reference& value ) { bool present = (value.getPtr()); ar >> present; if (present) { @@ -90,11 +90,11 @@ inline void load( Archive& ar, IRepPolicyRef& value ) { } template -inline void save( Archive& ar, const IRepPolicyRef& value ) { +inline void save( Archive& ar, const Reference& value ) { bool present = (value.getPtr()); ar << present; if (present) { - serializeReplicationPolicy(ar, (IRepPolicyRef&) value); + serializeReplicationPolicy(ar, (Reference&) value); } } @@ -107,9 +107,9 @@ struct PolicyOne : IReplicationPolicy, public ReferenceCounted { virtual int depth() const { return 1; } virtual bool validate( std::vector const& solutionSet, - LocalitySetRef const& fromServers ) const; + Reference const& fromServers ) const; virtual bool selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector const& alsoServers, std::vector & results ); template @@ -119,7 +119,7 @@ struct PolicyOne : IReplicationPolicy, public ReferenceCounted { }; struct PolicyAcross : IReplicationPolicy, public ReferenceCounted { - PolicyAcross(int count, std::string const& attribKey, IRepPolicyRef const policy); + PolicyAcross(int count, std::string const& attribKey, Reference const policy); virtual ~PolicyAcross(); virtual std::string name() const { return "Across"; } virtual std::string info() const @@ -128,9 +128,9 @@ struct PolicyAcross : IReplicationPolicy, public ReferenceCounted virtual int depth() const { return 1 + _policy->depth(); } virtual bool validate( std::vector const& solutionSet, - LocalitySetRef const& fromServers ) const; + Reference const& fromServers ) const; virtual bool selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector const& alsoServers, std::vector & results ); @@ -149,18 +149,18 @@ struct PolicyAcross : IReplicationPolicy, public ReferenceCounted protected: int _count; std::string _attribKey; - IRepPolicyRef _policy; + Reference _policy; // Cache temporary members std::vector _usedValues; std::vector _newResults; - LocalitySetRef _selected; + Reference _selected; VectorRef> _addedResults; Arena _arena; }; struct PolicyAnd : IReplicationPolicy, public ReferenceCounted { - PolicyAnd(std::vector policies): _policies(policies), _sortedPolicies(policies) + PolicyAnd(std::vector> policies): _policies(policies), _sortedPolicies(policies) { // Sort the policy array std::sort(_sortedPolicies.begin(), _sortedPolicies.end(), PolicyAnd::comparePolicy); @@ -194,14 +194,14 @@ struct PolicyAnd : IReplicationPolicy, public ReferenceCounted { } virtual bool validate( std::vector const& solutionSet, - LocalitySetRef const& fromServers ) const; + Reference const& fromServers ) const; virtual bool selectReplicas( - LocalitySetRef & fromServers, + Reference & fromServers, std::vector const& alsoServers, std::vector & results ); - static bool 
comparePolicy(const IRepPolicyRef& rhs, const IRepPolicyRef& lhs) + static bool comparePolicy(const Reference& rhs, const Reference& lhs) { return (lhs->maxResults() < rhs->maxResults()) || (!(rhs->maxResults() < lhs->maxResults()) && (lhs->depth() < rhs->depth())); } template @@ -219,18 +219,18 @@ struct PolicyAnd : IReplicationPolicy, public ReferenceCounted { } virtual void attributeKeys(std::set *set) const override - { for (const IRepPolicyRef& r : _policies) { r->attributeKeys(set); } } + { for (const Reference& r : _policies) { r->attributeKeys(set); } } protected: - std::vector _policies; - std::vector _sortedPolicies; + std::vector> _policies; + std::vector> _sortedPolicies; }; extern int testReplication(); template -void serializeReplicationPolicy(Ar& ar, IRepPolicyRef& policy) { +void serializeReplicationPolicy(Ar& ar, Reference& policy) { if(Ar::isDeserializing) { StringRef name; serializer(ar, name); @@ -238,20 +238,20 @@ void serializeReplicationPolicy(Ar& ar, IRepPolicyRef& policy) { if(name == LiteralStringRef("One")) { PolicyOne* pointer = new PolicyOne(); pointer->serialize(ar); - policy = IRepPolicyRef(pointer); + policy = Reference(pointer); } else if(name == LiteralStringRef("Across")) { - PolicyAcross* pointer = new PolicyAcross(0, "", IRepPolicyRef()); + PolicyAcross* pointer = new PolicyAcross(0, "", Reference()); pointer->serialize(ar); - policy = IRepPolicyRef(pointer); + policy = Reference(pointer); } else if(name == LiteralStringRef("And")) { PolicyAnd* pointer = new PolicyAnd({}); pointer->serialize(ar); - policy = IRepPolicyRef(pointer); + policy = Reference(pointer); } else if(name == LiteralStringRef("None")) { - policy = IRepPolicyRef(); + policy = Reference(); } else { TraceEvent(SevError, "SerializingInvalidPolicyType") diff --git a/fdbrpc/ReplicationTypes.h b/fdbrpc/ReplicationTypes.h index ef5463f54b..9a9f517d15 100644 --- a/fdbrpc/ReplicationTypes.h +++ b/fdbrpc/ReplicationTypes.h @@ -34,9 +34,6 @@ struct LocalityRecord; struct StringToIntMap; struct IReplicationPolicy; -typedef Reference LocalitySetRef; -typedef Reference IRepPolicyRef; - extern int g_replicationdebug; struct AttribKey { diff --git a/fdbrpc/ReplicationUtils.cpp b/fdbrpc/ReplicationUtils.cpp index ae92fd7950..d2c7e734a0 100644 --- a/fdbrpc/ReplicationUtils.cpp +++ b/fdbrpc/ReplicationUtils.cpp @@ -27,8 +27,8 @@ double ratePolicy( - LocalitySetRef & localitySet, - IRepPolicyRef const& policy, + Reference & localitySet, + Reference const& policy, unsigned int nTestTotal) { double rating = -1.0; @@ -85,14 +85,14 @@ double ratePolicy( bool findBestPolicySet( std::vector& bestResults, - LocalitySetRef & localitySet, - IRepPolicyRef const& policy, + Reference & localitySet, + Reference const& policy, unsigned int nMinItems, unsigned int nSelectTests, unsigned int nPolicyTests) { bool bSucceeded = true; - LocalitySetRef bestLocalitySet, testLocalitySet; + Reference bestLocalitySet, testLocalitySet; std::vector results; double testRate, bestRate = -1.0; @@ -162,15 +162,15 @@ bool findBestPolicySet( bool findBestUniquePolicySet( std::vector& bestResults, - LocalitySetRef & localitySet, - IRepPolicyRef const& policy, + Reference & localitySet, + Reference const& policy, StringRef localityUniquenessKey, unsigned int nMinItems, unsigned int nSelectTests, unsigned int nPolicyTests) { bool bSucceeded = true; - LocalitySetRef bestLocalitySet, testLocalitySet; + Reference bestLocalitySet, testLocalitySet; std::vector results; double testRate, bestRate = -1.0; @@ -262,7 +262,7 @@ bool 
findBestUniquePolicySet( bool validateAllCombinations( std::vector & offendingCombo, LocalityGroup const& localitySet, - IRepPolicyRef const& policy, + Reference const& policy, std::vector const& newItems, unsigned int nCombinationSize, bool bCheckIfValid) @@ -286,12 +286,12 @@ bool validateAllCombinations( } else { - bool bIsValidGroup; + bool bIsValidGroup; LocalityGroup localityGroup; std::string bitmask(nCombinationSize, 1); // K leading 1's bitmask.resize(newItems.size(), 0); // N-K trailing 0's - + do { localityGroup.deep_copy(localitySet); @@ -337,7 +337,7 @@ bool validateAllCombinations( bool validateAllCombinations( LocalityGroup const& localitySet, - IRepPolicyRef const& policy, + Reference const& policy, std::vector const& newItems, unsigned int nCombinationSize, bool bCheckIfValid) @@ -358,10 +358,10 @@ repTestType convertToTestType(int iValue) { return sValue; } -LocalitySetRef createTestLocalityMap(std::vector& indexes, int dcTotal, +Reference createTestLocalityMap(std::vector& indexes, int dcTotal, int szTotal, int rackTotal, int slotTotal, int independentItems, int independentTotal) { - LocalitySetRef buildServer(new LocalityMap()); + Reference buildServer(new LocalityMap()); LocalityMap* serverMap = (LocalityMap*) buildServer.getPtr(); int serverValue, dcLoop, szLoop, rackLoop, slotLoop; std::string dcText, szText, rackText, slotText, independentName, independentText; @@ -442,8 +442,8 @@ LocalitySetRef createTestLocalityMap(std::vector& indexes, int dcTo } bool testPolicy( - LocalitySetRef servers, - IRepPolicyRef const& policy, + Reference servers, + Reference const& policy, std::vector const& including, bool validate) { @@ -506,109 +506,109 @@ bool testPolicy( } bool testPolicy( - LocalitySetRef servers, - IRepPolicyRef const& policy, + Reference servers, + Reference const& policy, bool validate) { return testPolicy(servers, policy, emptyEntryArray, validate); } -std::vector const& getStaticPolicies() +std::vector> const& getStaticPolicies() { - static std::vector staticPolicies; + static std::vector> staticPolicies; if (staticPolicies.empty()) { staticPolicies = { - IRepPolicyRef( new PolicyOne() ), + Reference( new PolicyOne() ), // 1 'dc^2 x 1' - IRepPolicyRef( new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyOne() ) ) ), + Reference( new PolicyAcross(2, "dc", Reference( new PolicyOne() ) ) ), // 2 'dc^3 x 1' - IRepPolicyRef( new PolicyAcross(3, "dc", IRepPolicyRef( new PolicyOne() ) ) ), + Reference( new PolicyAcross(3, "dc", Reference( new PolicyOne() ) ) ), // 3 'sz^3 x 1' - IRepPolicyRef( new PolicyAcross(3, "sz", IRepPolicyRef( new PolicyOne() ) ) ), + Reference( new PolicyAcross(3, "sz", Reference( new PolicyOne() ) ) ), // 4 'dc^1 x az^3 x 1' - IRepPolicyRef( new PolicyAcross(1, "dc", IRepPolicyRef( new PolicyAcross(3, "az", IRepPolicyRef( new PolicyOne() ))) ) ), + Reference( new PolicyAcross(1, "dc", Reference( new PolicyAcross(3, "az", Reference( new PolicyOne() ))) ) ), // 5 '(sz^3 x rack^2 x 1) + (dc^2 x az^3 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(3, "sz", IRepPolicyRef(new PolicyAcross(2, "rack", IRepPolicyRef(new PolicyOne() ))))), IRepPolicyRef(new PolicyAcross(2, "dc", IRepPolicyRef(new PolicyAcross(3, "az", IRepPolicyRef(new PolicyOne()) ))) )} ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(3, "sz", Reference(new PolicyAcross(2, "rack", Reference(new PolicyOne() ))))), Reference(new PolicyAcross(2, "dc", Reference(new PolicyAcross(3, "az", Reference(new PolicyOne()) ))) )} ) ), // 6 '(sz^1 x 1)' - 
IRepPolicyRef( new PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne())) ), + Reference( new PolicyAcross(1, "sz", Reference(new PolicyOne())) ), // 7 '(sz^1 x 1) + (sz^1 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne()))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(1, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(1, "sz", Reference(new PolicyOne()))) } ) ), // 8 '(sz^2 x 1) + (sz^2 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(2, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(2, "sz", Reference(new PolicyOne()))) } ) ), // 9 '(dc^1 x sz^2 x 1)' - IRepPolicyRef( new PolicyAcross(1, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))), + Reference( new PolicyAcross(1, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))), //10 '(dc^2 x sz^2 x 1)' - IRepPolicyRef( new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))), + Reference( new PolicyAcross(2, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))), //11 '(dc^1 x sz^2 x 1) + (dc^2 x sz^2 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(1, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))), IRepPolicyRef(new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(1, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))), Reference(new PolicyAcross(2, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))) } ) ), //12 '(dc^2 x sz^2 x 1) + (dc^1 x sz^2 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))), IRepPolicyRef(new PolicyAcross(1, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(2, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))), Reference(new PolicyAcross(1, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))) } ) ), //13 '(sz^2 x 1) + (dc^1 x sz^2 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(1, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(2, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(1, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))) } ) ), //14 '(sz^2 x 1) + (dc^2 x sz^2 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(2, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(2, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))) } ) ), //15 '(sz^3 x 1) + (dc^2 x sz^2 x 1)' - 
IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(3, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(3, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(2, "dc", Reference( new PolicyAcross(2, "sz", Reference(new PolicyOne()))))) } ) ), //16 '(sz^1 x 1) + (sz^2 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(1, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(2, "sz", Reference(new PolicyOne()))) } ) ), //17 '(sz^2 x 1) + (sz^3 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(3, "sz", IRepPolicyRef(new PolicyOne()))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(2, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(3, "sz", Reference(new PolicyOne()))) } ) ), //18 '(sz^1 x 1) + (sz^2 x 1) + (sz^3 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(3, "sz", IRepPolicyRef(new PolicyOne()))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(1, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(2, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(3, "sz", Reference(new PolicyOne()))) } ) ), //19 '(sz^1 x 1) + (machine^1 x 1)' - IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(1, "zoneid", IRepPolicyRef(new PolicyOne()))) } ) ), + Reference( new PolicyAnd( { Reference(new PolicyAcross(1, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(1, "zoneid", Reference(new PolicyOne()))) } ) ), // '(dc^1 x 1) + (sz^1 x 1) + (machine^1 x 1)' - // IRepPolicyRef( new PolicyAnd( { IRepPolicyRef(new PolicyAcross(1, "dc", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(1, "zoneid", IRepPolicyRef(new PolicyOne()))) } ) ), + // Reference( new PolicyAnd( { Reference(new PolicyAcross(1, "dc", Reference(new PolicyOne()))), Reference(new PolicyAcross(1, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(1, "zoneid", Reference(new PolicyOne()))) } ) ), // '(dc^1 x sz^3 x 1)' - IRepPolicyRef( new PolicyAcross(1, "dc", IRepPolicyRef( new PolicyAcross(3, "sz", IRepPolicyRef(new PolicyOne())))) ), + Reference( new PolicyAcross(1, "dc", Reference( new PolicyAcross(3, "sz", Reference(new PolicyOne())))) ), // '(dc^2 x sz^3 x 1)' - IRepPolicyRef( new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyAcross(3, "sz", IRepPolicyRef(new PolicyOne())))) ), + Reference( new PolicyAcross(2, "dc", Reference( new PolicyAcross(3, "sz", Reference(new PolicyOne())))) ), // '(dc^2 x az^3 x 1)' - IRepPolicyRef( new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyAcross(3, "az", IRepPolicyRef(new PolicyOne())))) ), + Reference( new PolicyAcross(2, "dc", Reference( new PolicyAcross(3, "az", Reference(new PolicyOne())))) ), // '(sz^1 x 1) + (dc^2 x az^3 x 1)' - IRepPolicyRef( new PolicyAnd({IRepPolicyRef(new 
PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(2, "dc", IRepPolicyRef( new PolicyAcross(3, "az", IRepPolicyRef(new PolicyOne())))))}) ), + Reference( new PolicyAnd({Reference(new PolicyAcross(1, "sz", Reference(new PolicyOne()))), Reference(new PolicyAcross(2, "dc", Reference( new PolicyAcross(3, "az", Reference(new PolicyOne())))))}) ), // 'dc^1 x (az^2 x 1) + (sz^2 x 1)' - // IRepPolicyRef( new PolicyAcross(1, "dc", IRepPolicyRef(new PolicyAnd({IRepPolicyRef(new PolicyAcross(2, "az", IRepPolicyRef(new PolicyOne()))), IRepPolicyRef(new PolicyAcross(2, "sz", IRepPolicyRef(new PolicyOne())))}))) ), + // Reference( new PolicyAcross(1, "dc", Reference(new PolicyAnd({Reference(new PolicyAcross(2, "az", Reference(new PolicyOne()))), Reference(new PolicyAcross(2, "sz", Reference(new PolicyOne())))}))) ), // Require backtracking - IRepPolicyRef( new PolicyAcross(8, "zoneid", IRepPolicyRef(new PolicyAcross(1, "az", IRepPolicyRef(new PolicyOne()))) ) ), - IRepPolicyRef( new PolicyAcross(8, "zoneid", IRepPolicyRef(new PolicyAcross(1, "sz", IRepPolicyRef(new PolicyOne()))) ) ) + Reference( new PolicyAcross(8, "zoneid", Reference(new PolicyAcross(1, "az", Reference(new PolicyOne()))) ) ), + Reference( new PolicyAcross(8, "zoneid", Reference(new PolicyAcross(1, "sz", Reference(new PolicyOne()))) ) ) }; } return staticPolicies; } -IRepPolicyRef const randomAcrossPolicy(LocalitySet const& serverSet) +Reference const randomAcrossPolicy(LocalitySet const& serverSet) { int usedKeyTotal, keysUsed, keyIndex, valueTotal, maxValueTotal, maxKeyTotal, skips, lastKeyIndex; std::vector keyArray(serverSet.getGroupKeyMap()->_lookuparray); @@ -616,7 +616,7 @@ IRepPolicyRef const randomAcrossPolicy(LocalitySet const& serverSet) AttribKey indexKey; Optional keyValue; std::string keyText; - IRepPolicyRef policy(new PolicyOne()); + Reference policy(new PolicyOne()); // Determine the number of keys to used within the policy usedKeyTotal = g_random->randomInt(1, keyArray.size()+1); @@ -669,7 +669,7 @@ IRepPolicyRef const randomAcrossPolicy(LocalitySet const& serverSet) } valueTotal = g_random->randomInt(1, valueSet.size()+2); if ((valueTotal > maxValueTotal) && (g_random->random01() > .25)) valueTotal = maxValueTotal; - policy = IRepPolicyRef( new PolicyAcross(valueTotal, keyText, policy) ); + policy = Reference( new PolicyAcross(valueTotal, keyText, policy) ); if (g_replicationdebug > 1) { printf(" item%3d: (%3d =>%3d) %-10s =>%4d\n", keysUsed+1, keyIndex, indexKey._id, keyText.c_str(), valueTotal); } @@ -725,8 +725,8 @@ int testReplication() int policyMin = policyMinEnv ? 
 	int policyIndex, testCounter, alsoSize, debugBackup, maxAlsoSize;
 	std::vector serverIndexes;
-	LocalitySetRef testServers;
-	std::vector<IRepPolicyRef> policies;
+	Reference<LocalitySet> testServers;
+	std::vector<Reference<IReplicationPolicy>> policies;
 	std::vector alsoServers, bestSet;
 	int totalErrors = 0;
@@ -819,12 +819,12 @@ void filterLocalityDataForPolicy(const std::set<std::string>& keys, LocalityData
 	}
 }
 
-void filterLocalityDataForPolicy(IRepPolicyRef policy, LocalityData* ld) {
+void filterLocalityDataForPolicy(Reference<IReplicationPolicy> policy, LocalityData* ld) {
 	if (!policy) return;
 	filterLocalityDataForPolicy(policy->attributeKeys(), ld);
 }
 
-void filterLocalityDataForPolicy(IRepPolicyRef policy, std::vector<LocalityData>* vld) {
+void filterLocalityDataForPolicy(Reference<IReplicationPolicy> policy, std::vector<LocalityData>* vld) {
 	if (!policy) return;
 	std::set<std::string> keys = policy->attributeKeys();
 	for (LocalityData& ld : *vld) {
diff --git a/fdbrpc/ReplicationUtils.h b/fdbrpc/ReplicationUtils.h
index f359e7489f..f9f1987e78 100644
--- a/fdbrpc/ReplicationUtils.h
+++ b/fdbrpc/ReplicationUtils.h
@@ -34,22 +34,22 @@ extern repTestType convertToTestType(int iValue);
 extern int testReplication();
 
 extern double ratePolicy(
-	LocalitySetRef & localitySet,
-	IRepPolicyRef const& policy,
+	Reference<LocalitySet> & localitySet,
+	Reference<IReplicationPolicy> const& policy,
 	unsigned int nSelectTests);
 
 extern bool findBestPolicySet(
 	std::vector& bestResults,
-	LocalitySetRef & localitySet,
-	IRepPolicyRef const& policy,
+	Reference<LocalitySet> & localitySet,
+	Reference<IReplicationPolicy> const& policy,
 	unsigned int nMinItems,
 	unsigned int nSelectTests,
 	unsigned int nPolicyTests);
 
 extern bool findBestUniquePolicySet(
 	std::vector& bestResults,
-	LocalitySetRef & localitySet,
-	IRepPolicyRef const& policy,
+	Reference<LocalitySet> & localitySet,
+	Reference<IReplicationPolicy> const& policy,
 	StringRef localityUniquenessKey,
 	unsigned int nMinItems,
 	unsigned int nSelectTests,
@@ -60,20 +60,20 @@ extern bool findBestUniquePolicySet(
 extern bool validateAllCombinations(
 	std::vector & offendingCombo,
 	LocalityGroup const& localitySet,
-	IRepPolicyRef const& policy,
+	Reference<IReplicationPolicy> const& policy,
 	std::vector const& newItems,
 	unsigned int nCombinationSize,
 	bool bCheckIfValid = true);
 
 extern bool validateAllCombinations(
 	LocalityGroup const& localitySet,
-	IRepPolicyRef const& policy,
+	Reference<IReplicationPolicy> const& policy,
 	std::vector const& newItems,
 	unsigned int nCombinationSize,
 	bool bCheckIfValid = true);
 
 /// Remove all pieces of locality information from the LocalityData that will not be used when validating the policy.
-void filterLocalityDataForPolicy(IRepPolicyRef policy, LocalityData* ld);
-void filterLocalityDataForPolicy(IRepPolicyRef policy, std::vector<LocalityData>* vld);
+void filterLocalityDataForPolicy(Reference<IReplicationPolicy> policy, LocalityData* ld);
+void filterLocalityDataForPolicy(Reference<IReplicationPolicy> policy, std::vector<LocalityData>* vld);
 
 #endif
diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h
index 2987c80655..7cb645e70f 100644
--- a/fdbrpc/simulator.h
+++ b/fdbrpc/simulator.h
@@ -280,11 +280,11 @@ public:
 	std::set protectedAddresses;
 	std::map currentlyRebootingProcesses;
 	class ClusterConnectionString* extraDB;
-	IRepPolicyRef storagePolicy;
-	IRepPolicyRef tLogPolicy;
+	Reference<IReplicationPolicy> storagePolicy;
+	Reference<IReplicationPolicy> tLogPolicy;
 	int32_t tLogWriteAntiQuorum;
 	Optional<Standalone<StringRef>> primaryDcId;
-	IRepPolicyRef remoteTLogPolicy;
+	Reference<IReplicationPolicy> remoteTLogPolicy;
 	int32_t usableRegions;
 	std::string disablePrimary;
 	std::string disableRemote;
@@ -292,8 +292,8 @@ public:
 	bool allowLogSetKills;
 	Optional<Standalone<StringRef>> remoteDcId;
 	bool hasSatelliteReplication;
-	IRepPolicyRef satelliteTLogPolicy;
-	IRepPolicyRef satelliteTLogPolicyFallback;
+	Reference<IReplicationPolicy> satelliteTLogPolicy;
+	Reference<IReplicationPolicy> satelliteTLogPolicyFallback;
 	int32_t satelliteTLogWriteAntiQuorum;
 	int32_t satelliteTLogWriteAntiQuorumFallback;
 	std::vector<Optional<Standalone<StringRef>>> primarySatelliteDcIds;
diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp
index ca805e0db1..40556f0c40 100644
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@@ -234,10 +234,10 @@ public:
 		throw no_more_servers();
 	}
 
-	std::vector getWorkersForSeedServers( DatabaseConfiguration const& conf, IRepPolicyRef const& policy, Optional<Optional<Standalone<StringRef>>> const& dcId = Optional<Optional<Standalone<StringRef>>>() ) {
+	std::vector getWorkersForSeedServers( DatabaseConfiguration const& conf, Reference<IReplicationPolicy> const& policy, Optional<Optional<Standalone<StringRef>>> const& dcId = Optional<Optional<Standalone<StringRef>>>() ) {
 		std::map> fitness_workers;
 		std::vector results;
-		LocalitySetRef logServerSet = Reference<LocalitySet>(new LocalityMap());
+		Reference<LocalitySet> logServerSet = Reference<LocalitySet>(new LocalityMap());
 		LocalityMap* logServerMap = (LocalityMap*) logServerSet.getPtr();
 		bool bCompleted = false;
@@ -275,11 +275,11 @@ public:
 		return results;
 	}
 
-	std::vector getWorkersForTlogs( DatabaseConfiguration const& conf, int32_t required, int32_t desired, IRepPolicyRef const& policy, std::map< Optional<Standalone<StringRef>>, int>& id_used, bool checkStable = false, std::set<Optional<Key>> dcIds = std::set<Optional<Key>>() ) {
+	std::vector getWorkersForTlogs( DatabaseConfiguration const& conf, int32_t required, int32_t desired, Reference<IReplicationPolicy> const& policy, std::map< Optional<Standalone<StringRef>>, int>& id_used, bool checkStable = false, std::set<Optional<Key>> dcIds = std::set<Optional<Key>>() ) {
 		std::map, vector> fitness_workers;
 		std::vector results;
 		std::vector unavailableLocals;
-		LocalitySetRef logServerSet;
+		Reference<LocalitySet> logServerSet;
 		LocalityMap* logServerMap;
 		bool bCompleted = false;
diff --git a/fdbserver/DBCoreState.h b/fdbserver/DBCoreState.h
index 1b6d3e3bc1..ebaaff8c73 100644
--- a/fdbserver/DBCoreState.h
+++ b/fdbserver/DBCoreState.h
@@ -41,7 +41,7 @@ struct CoreTLogSet {
 	int32_t tLogWriteAntiQuorum; // The write anti quorum previously used to write to tLogs, which might be different from the anti quorum suggested by the current configuration going forward!
 	int32_t tLogReplicationFactor; // The replication factor previously used to write to tLogs, which might be different from the current configuration
 	std::vector< LocalityData > tLogLocalities; // Stores the localities of the log servers
-	IRepPolicyRef tLogPolicy;
+	Reference<IReplicationPolicy> tLogPolicy;
 	bool isLocal;
 	int8_t locality;
 	Version startVersion;
diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
index f57e5c6b13..ead66847d1 100644
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ -3698,7 +3698,7 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncVar<struct ServerDBInfo>> db ) {
-DDTeamCollection* testTeamCollection(int teamSize, IRepPolicyRef policy, int processCount) {
+DDTeamCollection* testTeamCollection(int teamSize, Reference<IReplicationPolicy> policy, int processCount) {
 	Database database = DatabaseContext::create( Reference<AsyncVar<ClientDBInfo>>(new AsyncVar<ClientDBInfo>()), Never(),
@@ -3740,7 +3740,7 @@ DDTeamCollection* testTeamCollection(int teamSize, IRepPolicyRef policy, int pro
 	return collection;
 }
 
-DDTeamCollection* testMachineTeamCollection(int teamSize, IRepPolicyRef policy, int processCount) {
+DDTeamCollection* testMachineTeamCollection(int teamSize, Reference<IReplicationPolicy> policy, int processCount) {
 	Database database = DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>>(new AsyncVar<ClientDBInfo>()), Never(),
 	                                            LocalityData(), false);
@@ -3792,7 +3792,7 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") {
 	int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize;
 	int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
 
-	IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(teamSize, "zoneid", IRepPolicyRef(new PolicyOne())));
+	Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(new PolicyAcross(teamSize, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
 	state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize);
 
 	int result = collection->addTeamsBestOf(30, desiredTeams, maxTeams);
@@ -3812,7 +3812,7 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/NotUseMachineID") {
 	int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize;
 	int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
 
-	IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(teamSize, "zoneid", IRepPolicyRef(new PolicyOne())));
+	Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(new PolicyAcross(teamSize, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
 	state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize);
 
 	if (collection == NULL) {
@@ -3830,7 +3830,7 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/NotUseMachineID") {
 }
 
 TEST_CASE("DataDistribution/AddAllTeams/isExhaustive") {
-	IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne())));
+	Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(new PolicyAcross(3, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
 	state int processSize = 10;
 	state int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize;
 	state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
@@ -3849,7 +3849,7 @@ TEST_CASE("DataDistribution/AddAllTeams/isExhaustive") {
 }
 
 TEST_CASE("/DataDistribution/AddAllTeams/withLimit") {
-	IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne())));
+	Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(new PolicyAcross(3, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
 	state int processSize = 10;
 	state int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize;
 	state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
@@ -3867,7 +3867,7 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") {
 
 TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") {
 	wait(Future<Void>(Void()));
-	IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne())));
+	Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(new PolicyAcross(3, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
 	state int processSize = 10;
 	state int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize;
 	state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
@@ -3897,7 +3897,7 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") {
 
 TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") {
 	wait(Future<Void>(Void()));
 
-	IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne())));
+	Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(new PolicyAcross(3, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
 	state int processSize = 5;
 	state int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize;
 	state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h
index 8b8dc7e8dc..3ed53b0475 100644
--- a/fdbserver/LogSystem.h
+++ b/fdbserver/LogSystem.h
@@ -40,8 +40,8 @@ public:
 	int32_t tLogReplicationFactor;
 	std::vector< LocalityData > tLogLocalities; // Stores the localities of the log servers
 	TLogVersion tLogVersion;
-	IRepPolicyRef tLogPolicy;
-	LocalitySetRef logServerSet;
+	Reference<IReplicationPolicy> tLogPolicy;
+	Reference<LocalitySet> logServerSet;
 	std::vector<int> logIndexArray;
 	std::vector<LocalityEntry> logEntryArray;
 	bool isLocal;
@@ -84,7 +84,7 @@ public:
 			used_servers.insert(std::make_pair(0,i));
 		}
 
-		LocalitySetRef serverSet = Reference<LocalitySet>(new LocalityMap<std::pair<int,int>>());
+		Reference<LocalitySet> serverSet = Reference<LocalitySet>(new LocalityMap<std::pair<int,int>>());
 		LocalityMap<std::pair<int,int>>* serverMap = (LocalityMap<std::pair<int,int>>*) serverSet.getPtr();
 		std::vector<std::pair<int,int>> resultPairs;
 		for(int loc = 0; loc < satelliteTagLocations.size(); loc++) {
@@ -189,7 +189,7 @@ public:
 
 	void updateLocalitySet( vector<LocalityData> const& localities ) {
 		LocalityMap<int>* logServerMap;
-		logServerSet = LocalitySetRef(new LocalityMap<int>());
+		logServerSet = Reference<LocalitySet>(new LocalityMap<int>());
 		logServerMap = (LocalityMap<int>*) logServerSet.getPtr();
 
 		logEntryArray.clear();
@@ -412,7 +412,7 @@ struct ILogSystem {
 		int tLogReplicationFactor;
 
 		MergedPeekCursor( vector< Reference<IPeekCursor> > const& serverCursors, Version begin );
-		MergedPeekCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end, bool parallelGetMore, std::vector<LocalityData> const& tLogLocalities, IRepPolicyRef const tLogPolicy, int tLogReplicationFactor );
+		MergedPeekCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end, bool parallelGetMore, std::vector<LocalityData> const& tLogLocalities, Reference<IReplicationPolicy> const tLogPolicy, int tLogReplicationFactor );
 		MergedPeekCursor( vector< Reference<IPeekCursor> > const& serverCursors, LogMessageVersion const& messageVersion, int bestServer, int readQuorum, Optional<LogMessageVersion> nextVersion, Reference<LogSet> logSet, int tLogReplicationFactor );
 
 		virtual Reference<IPeekCursor> cloneNoMore();
diff --git a/fdbserver/LogSystemConfig.h b/fdbserver/LogSystemConfig.h
index 3c24dc84b5..6890726579 100644
--- a/fdbserver/LogSystemConfig.h
+++ b/fdbserver/LogSystemConfig.h
@@ -61,7 +61,7 @@ struct TLogSet {
 	int32_t tLogWriteAntiQuorum, tLogReplicationFactor;
 	std::vector< LocalityData > tLogLocalities; // Stores the localities of the log servers
 	TLogVersion tLogVersion;
-	IRepPolicyRef tLogPolicy;
+	Reference<IReplicationPolicy> tLogPolicy;
 	bool isLocal;
 	int8_t locality;
 	Version startVersion;
diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp
index 1351fb207f..feb4a2e8a0 100644
--- a/fdbserver/LogSystemPeekCursor.actor.cpp
+++ b/fdbserver/LogSystemPeekCursor.actor.cpp
@@ -273,7 +273,7 @@ ILogSystem::MergedPeekCursor::MergedPeekCursor( vector< Reference<IPeekCursor> >
 ILogSystem::MergedPeekCursor::MergedPeekCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end,
-	bool parallelGetMore, std::vector< LocalityData > const& tLogLocalities, IRepPolicyRef const tLogPolicy, int tLogReplicationFactor )
+	bool parallelGetMore, std::vector< LocalityData > const& tLogLocalities, Reference<IReplicationPolicy> const tLogPolicy, int tLogReplicationFactor )
 	: bestServer(bestServer), readQuorum(readQuorum), tag(tag), currentCursor(0), hasNextMessage(false), messageVersion(begin), randomID(g_random->randomUniqueID()), tLogReplicationFactor(tLogReplicationFactor) {
 	if(tLogPolicy) {
 		logSet = Reference<LogSet>( new LogSet() );
diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp
index 262d5a3449..3ecaa26e16 100644
--- a/fdbserver/TagPartitionedLogSystem.actor.cpp
+++ b/fdbserver/TagPartitionedLogSystem.actor.cpp
@@ -531,12 +531,12 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogSystem>
 		if(begin >= lastBegin) {
 			TraceEvent("TLogPeekRemoteBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString());
-			return Reference<ILogSystem::IPeekCursor>( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, begin, getPeekEnd(), false, std::vector<LocalityData>(), IRepPolicyRef(), 0 ) );
+			return Reference<ILogSystem::IPeekCursor>( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, begin, getPeekEnd(), false, std::vector<LocalityData>(), Reference<IReplicationPolicy>(), 0 ) );
 		} else {
 			std::vector< Reference<ILogSystem::IPeekCursor> > cursors;
 			std::vector< LogMessageVersion > epochEnds;
 			TraceEvent("TLogPeekRemoteAddingBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString());
-			cursors.push_back( Reference<ILogSystem::IPeekCursor>( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, lastBegin, getPeekEnd(), false, std::vector<LocalityData>(), IRepPolicyRef(), 0 ) ) );
+			cursors.push_back( Reference<ILogSystem::IPeekCursor>( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, lastBegin, getPeekEnd(), false, std::vector<LocalityData>(), Reference<IReplicationPolicy>(), 0 ) ) );
 			int i = 0;
 			while(begin < lastBegin) {
 				if(i == oldLogData.size()) {
@@ -565,7 +565,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogSystem>
 				.detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString())
 				.detail("LastBegin", lastBegin).detail("ThisBegin", thisBegin).detail("BestStartVer", oldLogData[i].tLogs[bestOldSet]->startVersion);
-				cursors.push_back( Reference<ILogSystem::IPeekCursor>( new ILogSystem::MergedPeekCursor( oldLogData[i].tLogs[bestOldSet]->logRouters, -1, (int)oldLogData[i].tLogs[bestOldSet]->logRouters.size(), tag,
-					thisBegin, lastBegin, false, std::vector<LocalityData>(), IRepPolicyRef(), 0 ) ) );
+				cursors.push_back( Reference<ILogSystem::IPeekCursor>( new ILogSystem::MergedPeekCursor( oldLogData[i].tLogs[bestOldSet]->logRouters, -1, (int)oldLogData[i].tLogs[bestOldSet]->logRouters.size(), tag,
+					thisBegin, lastBegin, false, std::vector<LocalityData>(), Reference<IReplicationPolicy>(), 0 ) ) );
 				epochEnds.push_back(LogMessageVersion(lastBegin));
 				lastBegin = thisBegin;
 			}
@@ -1566,7 +1566,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogSystem>
 
 	ACTOR static Future<Void> recruitOldLogRouters( TagPartitionedLogSystem* self, vector<WorkerInterface> workers, LogEpoch recoveryCount, int8_t locality, Version startVersion,
-		std::vector<LocalityData> tLogLocalities, IRepPolicyRef tLogPolicy, bool forRemote ) {
+		std::vector<LocalityData> tLogLocalities, Reference<IReplicationPolicy> tLogPolicy, bool forRemote ) {
 		state vector<vector<Future<TLogInterface>>> logRouterInitializationReplies;
 		state vector<Future<TLogInterface>> allReplies;
 		int nextRouter = 0;
diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h
index eef28a8cfe..12481e0596 100644
--- a/fdbserver/WorkerInterface.actor.h
+++ b/fdbserver/WorkerInterface.actor.h
@@ -120,7 +120,7 @@ struct InitializeLogRouterRequest {
 	Tag routerTag;
 	Version startVersion;
 	std::vector tLogLocalities;
-	IRepPolicyRef tLogPolicy;
+	Reference<IReplicationPolicy> tLogPolicy;
 	int8_t locality;
 	ReplyPromise reply;

From c32504f7059789da7827e872afe83173d4618218 Mon Sep 17 00:00:00 2001
From: Vishesh Yadav
Date: Tue, 12 Mar 2019 14:04:18 -0700
Subject: [PATCH 41/46] io: Add DISABLE_POSIX_KERNEL_AIO knob to use EIO instead of Kernel AIO

- Some Linux filesystems don't support O_DIRECT, which is required by
  Kernel AIO to function properly. Instead of using O_SYNC, EIO is a much
  better option in terms of performance penalty.

- Some systems may not support AIO at all, e.g. Windows Subsystem for Linux.

FIXES #842
RELATED #274
---
 fdbrpc/Net2FileSystem.cpp | 7 ++++++-
 flow/Knobs.cpp            | 1 +
 flow/Knobs.h              | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp
index 25c703e1bb..31ce9f6095 100644
--- a/fdbrpc/Net2FileSystem.cpp
+++ b/fdbrpc/Net2FileSystem.cpp
@@ -58,7 +58,12 @@ Future< Reference<IAsyncFile> > Net2FileSystem::open( std::string filename
 	Future<Reference<IAsyncFile>> f;
 #ifdef __linux__
-	if ( (flags & IAsyncFile::OPEN_UNBUFFERED) && !(flags & IAsyncFile::OPEN_NO_AIO) )
+	// In the vast majority of cases, we wish to use Kernel AIO. However,
+	// some systems don't properly support kernel async I/O without
+	// O_DIRECT, or don't support AIO at all. In such cases, the
+	// DISABLE_POSIX_KERNEL_AIO knob can be enabled to fall back to EIO
+	// instead of Kernel AIO.
+	if ((flags & IAsyncFile::OPEN_UNBUFFERED) && !(flags & IAsyncFile::OPEN_NO_AIO) &&
+	    !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO)
 		f = AsyncFileKAIO::open(filename, flags, mode, NULL);
 	else
 #endif
diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp
index 98f472a0bc..00151c43ed 100644
--- a/flow/Knobs.cpp
+++ b/flow/Knobs.cpp
@@ -79,6 +79,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
 	init( MIN_SUBMIT,                                          10 );

 	init( PAGE_WRITE_CHECKSUM_HISTORY,                          0 ); if( randomize && BUGGIFY ) PAGE_WRITE_CHECKSUM_HISTORY = 10000000;
+	init( DISABLE_POSIX_KERNEL_AIO,                             0 );

 	//AsyncFileNonDurable
 	init( MAX_PRIOR_MODIFICATION_DELAY,                       1.0 ); if( randomize && BUGGIFY ) MAX_PRIOR_MODIFICATION_DELAY = 10.0;
diff --git a/flow/Knobs.h b/flow/Knobs.h
index 22077dd6f1..e53120981f 100644
--- a/flow/Knobs.h
+++ b/flow/Knobs.h
@@ -98,6 +98,7 @@ public:
 	int MIN_SUBMIT;

 	int PAGE_WRITE_CHECKSUM_HISTORY;
+	int DISABLE_POSIX_KERNEL_AIO;

 	//AsyncFileNonDurable
 	double MAX_PRIOR_MODIFICATION_DELAY;
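To see the effect of the new knob in isolation, the toy program below mirrors only the selection condition from the Net2FileSystem.cpp hunk above; the flag values, the Knobs struct, and chooseFileImplementation() are invented for illustration and are not part of the FoundationDB API:

    #include <cstdio>

    // Invented stand-ins for the real IAsyncFile flags and FlowKnobs; only the
    // selection condition below mirrors the patch.
    enum OpenFlags { OPEN_UNBUFFERED = 1 << 0, OPEN_NO_AIO = 1 << 1 };
    struct Knobs { int DISABLE_POSIX_KERNEL_AIO = 0; };
    static Knobs knobs;

    // Returns the name of the file implementation the patched open() would
    // pick on Linux for the given open flags.
    const char* chooseFileImplementation(int flags) {
    	if ((flags & OPEN_UNBUFFERED) && !(flags & OPEN_NO_AIO) && !knobs.DISABLE_POSIX_KERNEL_AIO)
    		return "AsyncFileKAIO"; // kernel AIO; needs O_DIRECT support from the filesystem
    	return "EIO";               // thread-pool (libeio) fallback
    }

    int main() {
    	printf("%s\n", chooseFileImplementation(OPEN_UNBUFFERED)); // AsyncFileKAIO
    	knobs.DISABLE_POSIX_KERNEL_AIO = 1; // e.g. on WSL or a filesystem without O_DIRECT
    	printf("%s\n", chooseFileImplementation(OPEN_UNBUFFERED)); // EIO
    	return 0;
    }

Because the knob is initialized to 0 in Knobs.cpp, kernel AIO remains the default; deployments on filesystems without O_DIRECT support opt in explicitly.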
From d073b08a0a6239753448fb1e4f5aeae5f54c199e Mon Sep 17 00:00:00 2001
From: Vishesh Yadav
Date: Wed, 13 Mar 2019 11:47:02 -0700
Subject: [PATCH 42/46] Update DISABLE_POSIX_KERNEL_AIO entry in 6.1 release notes

---
 documentation/sphinx/source/release-notes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst
index efa6c17c64..faec3bdde4 100644
--- a/documentation/sphinx/source/release-notes.rst
+++ b/documentation/sphinx/source/release-notes.rst
@@ -33,7 +33,7 @@ Fixes
 -----

 * Python: Creating a ``SingleFloat`` for the tuple layer didn't work with integers. `(PR #1216) <https://github.com/apple/foundationdb/pull/1216>`_
-* Added `USE_EIO_FILE` knob to fall back to libeio instead of kernel async I/O (KAIO) for systems that do not support KAIO or the O_DIRECT flag. `(PR #1283) <https://github.com/apple/foundationdb/pull/1283>`_
+* Added `DISABLE_POSIX_KERNEL_AIO` knob to fall back to libeio instead of kernel async I/O (KAIO) for systems that do not support KAIO or the O_DIRECT flag. `(PR #1283) <https://github.com/apple/foundationdb/pull/1283>`_

 Status
 ------

From 7f480253486939e3921457e3d8d64764f0566a0b Mon Sep 17 00:00:00 2001
From: Evan Tschannen
Date: Wed, 13 Mar 2019 14:47:17 -0700
Subject: [PATCH 43/46] optimize confirm epoch alive

---
 fdbserver/TagPartitionedLogSystem.actor.cpp | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp
index 3ecaa26e16..87da4bcfc1 100644
--- a/fdbserver/TagPartitionedLogSystem.actor.cpp
+++ b/fdbserver/TagPartitionedLogSystem.actor.cpp
@@ -959,24 +959,17 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogSystem>
 			logSet->tLogReplicationFactor, numPresent - logSet->tLogWriteAntiQuorum) ) );

-		state Reference<LocalityGroup> locked(new LocalityGroup());
-		state std::vector<bool> responded(alive.size());
-		for (int i = 0; i < alive.size(); i++) {
-			responded[i] = false;
-		}
+		state std::vector<LocalityEntry> locked;
+		state std::vector<bool> responded(alive.size(), false);
 		loop {
 			for (int i = 0; i < alive.size(); i++) {
 				if (!responded[i] && alive[i].isReady() && !alive[i].isError()) {
-					locked->add(logSet->tLogLocalities[i]);
+					locked.push_back(logSet->logEntryArray[i]);
 					responded[i] = true;
 				}
 			}
-			bool quorum_obtained = locked->validate(logSet->tLogPolicy);
-			// We intentionally skip considering antiquorums, as the CPU cost of doing so is prohibitive.
-			if (logSet->tLogReplicationFactor == 1 && locked->size() > 0) {
-				ASSERT(quorum_obtained);
-			}
-			if (quorum_obtained) {
+
+			if (logSet->satisfiesPolicy(locked)) {
 				return Void();
 			}

From e8cb85ed8e190ead1c8af1e10f5bd6ee6dacd012 Mon Sep 17 00:00:00 2001
From: Evan Tschannen
Date: Wed, 13 Mar 2019 14:47:35 -0700
Subject: [PATCH 44/46] optimize validateAllCombinations

---
 fdbrpc/ReplicationUtils.cpp | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/fdbrpc/ReplicationUtils.cpp b/fdbrpc/ReplicationUtils.cpp
index d2c7e734a0..f4fd1770f7 100644
--- a/fdbrpc/ReplicationUtils.cpp
+++ b/fdbrpc/ReplicationUtils.cpp
@@ -287,24 +287,38 @@ bool validateAllCombinations(
 	else
 	{
 		bool bIsValidGroup;
-		LocalityGroup localityGroup;
-		std::string bitmask(nCombinationSize, 1); // K leading 1's
+		Reference<LocalitySet> localSet = Reference<LocalitySet>( new LocalityGroup() );
+		LocalityGroup* localGroup = (LocalityGroup*) localSet.getPtr();
+		localGroup->deep_copy(localitySet);
+
+		std::vector<LocalityEntry> originalEntries = localGroup->getEntries();
+
+		for (int i = 0; i < newItems.size(); ++i) {
+			localGroup->add(newItems[i]);
+		}
+
+		std::string bitmask(nCombinationSize, 1); // K leading 1's
 		bitmask.resize(newItems.size(), 0); // N-K trailing 0's

+		std::vector<LocalityEntry> localityGroupEntries;
+		std::vector<LocalityEntry> resultEntries;
 		do
 		{
-			localityGroup.deep_copy(localitySet);
-
+			localityGroupEntries = originalEntries;
 			// [0..N-1] integers
-			for (int i = 0; i < newItems.size(); ++i) {
+			for (int i = 0; i < bitmask.size(); ++i) {
 				if (bitmask[i]) {
-					localityGroup.add(newItems[i]);
+					localityGroupEntries.push_back(localGroup->getEntry(originalEntries.size() + i));
 				}
 			}

-			// Check if the group combination passes validation
-			bIsValidGroup = localityGroup.validate(policy);
+			resultEntries.clear();
+
+			// Run the policy, assert if unable to satisfy
+			bool result = localSet->selectReplicas(policy, localityGroupEntries, resultEntries);
+			ASSERT(result);
+
+			bIsValidGroup = resultEntries.size() == 0;

 			if (((bCheckIfValid) && (!bIsValidGroup) ) ||
@@ -319,7 +333,7 @@ bool validateAllCombinations(
 				}
 				if (g_replicationdebug > 2) {
 					printf("Invalid group\n");
-					localityGroup.DisplayEntries();
+					localGroup->DisplayEntries();
 				}
 				if (g_replicationdebug > 3) {
 					printf("Full set\n");

From e7d1f9e5f14d8725ff8dbdee9a705d92e1e389d5 Mon Sep 17 00:00:00 2001
From: Evan Tschannen
Date: Wed, 13 Mar 2019 15:59:03 -0700
Subject: [PATCH 45/46] fixed review comments

---
 fdbrpc/ReplicationUtils.cpp                 | 8 ++++----
 fdbserver/TagPartitionedLogSystem.actor.cpp | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fdbrpc/ReplicationUtils.cpp b/fdbrpc/ReplicationUtils.cpp
index f4fd1770f7..6c6099c107 100644
--- a/fdbrpc/ReplicationUtils.cpp
+++ b/fdbrpc/ReplicationUtils.cpp
@@ -291,7 +291,8 @@ bool validateAllCombinations(
 		LocalityGroup* localGroup = (LocalityGroup*) localSet.getPtr();
 		localGroup->deep_copy(localitySet);

-		std::vector<LocalityEntry> originalEntries = localGroup->getEntries();
+		std::vector<LocalityEntry> localityGroupEntries = localGroup->getEntries();
+		int originalSize = localityGroupEntries.size();

 		for (int i = 0; i < newItems.size(); ++i) {
 			localGroup->add(newItems[i]);
@@ -300,15 +301,14 @@ bool validateAllCombinations(
 		std::string bitmask(nCombinationSize, 1); // K leading 1's
 		bitmask.resize(newItems.size(), 0); // N-K trailing 0's

-		std::vector<LocalityEntry> localityGroupEntries;
 		std::vector<LocalityEntry> resultEntries;
 		do
 		{
-			localityGroupEntries = originalEntries;
+			localityGroupEntries.resize(originalSize);
 			// [0..N-1] integers
 			for (int i = 0; i < bitmask.size(); ++i) {
 				if (bitmask[i]) {
-					localityGroupEntries.push_back(localGroup->getEntry(originalEntries.size() + i));
+					localityGroupEntries.push_back(localGroup->getEntry(originalSize + i));
 				}
 			}

diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp
index 87da4bcfc1..0b1c4787bd 100644
--- a/fdbserver/TagPartitionedLogSystem.actor.cpp
+++ b/fdbserver/TagPartitionedLogSystem.actor.cpp
@@ -959,17 +959,17 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogSystem>
 			logSet->tLogReplicationFactor, numPresent - logSet->tLogWriteAntiQuorum) ) );

-		state std::vector<LocalityEntry> locked;
+		state std::vector<LocalityEntry> aliveEntries;
 		state std::vector<bool> responded(alive.size(), false);
 		loop {
 			for (int i = 0; i < alive.size(); i++) {
 				if (!responded[i] && alive[i].isReady() && !alive[i].isError()) {
-					locked.push_back(logSet->logEntryArray[i]);
+					aliveEntries.push_back(logSet->logEntryArray[i]);
 					responded[i] = true;
 				}
 			}

-			if (logSet->satisfiesPolicy(locked)) {
+			if (logSet->satisfiesPolicy(aliveEntries)) {
 				return Void();
 			}

From 529068c3e20ad844c4d2b98f44498bebe6c796e4 Mon Sep 17 00:00:00 2001
From: Vishesh Yadav
Date: Wed, 13 Mar 2019 15:34:52 -0700
Subject: [PATCH 46/46] doc: Live TLS migration

---
 documentation/sphinx/source/tls.rst | 48 +++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/documentation/sphinx/source/tls.rst b/documentation/sphinx/source/tls.rst
index 1884622498..f5b5c94852 100644
--- a/documentation/sphinx/source/tls.rst
+++ b/documentation/sphinx/source/tls.rst
@@ -29,10 +29,52 @@ This will configure the new cluster to communicate with TLS.

 .. note:: Depending on your operating system, version and configuration, there may be a firewall in place that prevents external access to certain ports. If necessary, please consult the appropriate documentation for your OS and ensure that all machines in your cluster can reach the ports configured in your :ref:`configuration file <foundationdb-conf>`.
 
-.. _converting-existing-cluster:
+.. _converting-existing-cluster-after-6.1:
 
-Converting an existing cluster to use TLS
-=========================================
+Converting an existing cluster to use TLS (since v6.1)
+======================================================
+
+Since version 6.1, FoundationDB clusters can be converted to TLS without downtime. A FoundationDB server can listen for TLS and unencrypted traffic simultaneously on two separate ports. As a result, FDB clusters can be live-migrated to TLS:
+
+1) Restart each FoundationDB server individually, but with an additional listen address for TLS traffic::
+
+     /path/to/fdbserver -C fdb.cluster -p 127.0.0.1:4500 -p 127.0.0.1:4600:tls
+
+   Since the server still listens for unencrypted traffic and the cluster file still contains the old address, the rest of the processes will be able to talk to this new process.
+
+2) Once all processes are listening for both TLS and unencrypted traffic, switch one or more coordinators to use TLS. For example, if the old coordinator list was ``127.0.0.1:4500,127.0.0.1:4501,127.0.0.1:4502``, the new one would be something like ``127.0.0.1:4600:tls,127.0.0.1:4501,127.0.0.1:4502``. Switching a few coordinators to TLS at a time allows a smoother migration and gives a window in which to find clients that do not yet have TLS configured. The number of coordinators each client can connect to can be seen via ``fdbstatus`` (look for the ``connected_coordinators`` field in ``clients``)::
+
+     "clients" : {
+         "count" : 2,
+         "supported_versions" : [
+             {
+                 "client_version" : "6.1.0",
+                 "connected_clients" : [
+                     {
+                         "address" : "127.0.0.1:42916",
+                         "connected_coordinators": 3,
+                         "log_group" : "default"
+                     },
+                     {
+                         "address" : "127.0.0.1:42918",
+                         "connected_coordinators": 2,
+                         "log_group" : "default"
+                     }
+                 ]
+             }, ...
+         ]
+     }
+
+3) If there exists a client (e.g., the client ``127.0.0.1:42918`` in the above example) that cannot connect to all coordinators after a coordinator is switched to TLS, it means that client has not set up its TLS configuration correctly. The system operator should notify the client's owner to correct it; otherwise, once all coordinators are switched to TLS ports, that client will lose its connection to the cluster.
+
+4) Repeat (2) and (3) until all the addresses in the coordinator list use TLS.
+
+5) Restart each FoundationDB server one last time, now with a single public address that listens only for TLS traffic.
+
+.. _converting-existing-cluster-before-6.1:
+
+Converting an existing cluster to use TLS (< v6.1)
+==================================================
 
 Enabling TLS on an existing (non-TLS) cluster cannot be accomplished without downtime because all processes must have TLS enabled to communicate. At startup, each server process enables TLS if the addresses in its cluster file are TLS-enabled. As a result, server processes must be stopped and restarted to convert them to use TLS. To convert the cluster to TLS in the most conservative way: