From 72de765083daf0d2ef25bda13a09763e92e390ff Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Thu, 13 Jul 2017 08:18:00 -0700 Subject: [PATCH 01/30] Remove unused code --- flow/Net2.actor.cpp | 119 -------------------------------------------- 1 file changed, 119 deletions(-) diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 28c9a74e43..89fbbdb93e 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -982,122 +982,3 @@ void startThreadF( F && func ) { Thing* t = new Thing(std::move(func)); startThread(Thing::start, t); } - -void net2_test() { - /*printf("ThreadSafeQueue test\n"); - printf(" Interface: "); - ThreadSafeQueue tq; - ASSERT( tq.canSleep() == true ); - - ASSERT( tq.push( 1 ) == true ) ; - ASSERT( tq.push( 2 ) == false ); - ASSERT( tq.push( 3 ) == false ); - - ASSERT( tq.pop().get() == 1 ); - ASSERT( tq.pop().get() == 2 ); - ASSERT( tq.push( 4 ) == false ); - ASSERT( tq.pop().get() == 3 ); - ASSERT( tq.pop().get() == 4 ); - ASSERT( !tq.pop().present() ); - printf("OK\n"); - - printf("Threaded: "); - Event finished, finished2; - int thread1Iterations = 1000000, thread2Iterations = 100000; - - if (thread1Iterations) - startThreadF([&](){ - printf("Thread1\n"); - for(int i=0; i i = tq.pop(); - if (i.present()) { - int v = i.get(); - ++c; - if (mx[v>>20] != v) - printf("Wrong value dequeued!\n"); - ASSERT( mx[v>>20] == v ); - mx[v>>20] = v + 1; - } else { - ++p; - _mm_pause(); - } - if ((c&3)==0) tq.canSleep(); - } - printf("%d %d %x %x %s\n", c, p, mx[0], mx[1], mx[0]==thread1Iterations && mx[1]==(1<<20)+thread2Iterations ? "OK" : "FAIL"); - - finished.block(); - finished2.block(); - - - g_network = newNet2(NetworkAddress::parse("127.0.0.1:12345")); // for promise serialization below - - Endpoint destination; - - printf(" Used: %lld\n", FastAllocator<4096>::getMemoryUsed()); - - char junk[100]; - - double before = timer(); - - vector reqs; - reqs.reserve( 10000 ); - - int totalBytes = 0; - for(int j=0; j<1000; j++) { - UnsentPacketQueue unsent; - ReliablePacketList reliable; - - reqs.resize(10000); - for(int i=0; i<10000; i++) { - TestGVR &req = reqs[i]; - req.key = LiteralStringRef("Foobar"); - - SerializeSource what(req); - - SendBuffer* pb = unsent.getWriteBuffer(); - ReliablePacket* rp = new ReliablePacket; // 0 - - PacketWriter wr(pb,rp,AssumeVersion(currentProtocolVersion)); - //BinaryWriter wr; - SplitBuffer packetLen; - uint32_t len = 0; - wr.writeAhead(sizeof(len), &packetLen); - wr << destination.token; - //req.reply.getEndpoint(); - what.serializePacketWriter(wr); - //wr.serializeBytes(junk, 43); - - unsent.setWriteBuffer(wr.finish()); - len = wr.size() - sizeof(len); - packetLen.write(&len, sizeof(len)); - - //totalBytes += wr.getLength(); - totalBytes += wr.size(); - - if (rp) reliable.insert(rp); - } - reqs.clear(); - unsent.discardAll(); - reliable.discardAll(); - } - - printf("SimSend x 1Kx10K: %0.2f sec\n", timer()-before); - printf(" Bytes: %d\n", totalBytes); - printf(" Used: %lld\n", FastAllocator<4096>::getMemoryUsed()); - */ -}; From 272b4b984c4d1dbf0c77717165beea2172043b03 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 25 Aug 2017 10:12:58 -0700 Subject: [PATCH 02/30] fix: fixed a rare bug where we do not wait for a file in the process of being deleted to shutdown before rebooting a machine --- fdbrpc/AsyncFileNonDurable.actor.h | 56 +++++++++++-------- fdbrpc/sim2.actor.cpp | 27 ++++++--- fdbrpc/simulator.h | 1 + fdbserver/DiskQueue.actor.cpp | 4 +- fdbserver/SimulatedCluster.actor.cpp | 5 ++ fdbserver/TLogServer.actor.cpp | 2 +- 
.../workloads/ConsistencyCheck.actor.cpp | 2 +- 7 files changed, 63 insertions(+), 34 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 9b26bbc3ae..912beb5e9c 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -213,8 +213,9 @@ public: //If we are in the process of deleting a file, we can't let someone else modify it at the same time. We therefore block the creation of new files until deletion is complete state std::map>::iterator deletedFile = filesBeingDeleted.find(filename); if(deletedFile != filesBeingDeleted.end()) { - //TraceEvent("AsyncFileNonDurableOpenWaitOnDelete").detail("Filename", filename); + //TraceEvent("AsyncFileNonDurableOpenWaitOnDelete1").detail("Filename", filename); Void _ = wait( deletedFile->second || shutdown ); + //TraceEvent("AsyncFileNonDurableOpenWaitOnDelete2").detail("Filename", filename); if(shutdown.isReady()) throw io_error().asInjectedFault(); } @@ -711,35 +712,44 @@ private: //Finishes all outstanding actors on an AsyncFileNonDurable and then deletes it ACTOR Future deleteFile(AsyncFileNonDurable *self) { - //We must run on the main thread (instead of a SQLite coroutine). We don't want to signal any promises from a coroutine, so we switch at the beginning - //of this ACTOR - Void _ = wait(self->returnToMainThread()); + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state int currentTaskID = g_network->getCurrentTask(); + state std::string filename = self->filename; - //Make sure all writes have gone through. - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); + Void _ = wait( g_simulator.onMachine( currentProcess ) ); + try { + //Make sure all writes have gone through. 
+ Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); - std::vector> outstandingModifications; + std::vector> outstandingModifications; - for(auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); ++itr) - if(itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); + for(auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); ++itr) + if(itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); - //Ignore errors here so that all modifications can finish - Void _ = wait(waitForAllReady(outstandingModifications)); + //Ignore errors here so that all modifications can finish + Void _ = wait(waitForAllReady(outstandingModifications)); - //Make sure we aren't in the process of killing the file - if(self->killed.isSet()) - Void _ = wait(self->killComplete.getFuture()); + //Make sure we aren't in the process of killing the file + if(self->killed.isSet()) + Void _ = wait(self->killComplete.getFuture()); - //Remove this file from the filesBeingDeleted map so that new files can be created with this filename - g_simulator.getMachineByNetworkAddress( self->openedAddress )->closingFiles.erase(self->getFilename()); - AsyncFileNonDurable::filesBeingDeleted.erase(self->filename); - //TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename); + //Remove this file from the filesBeingDeleted map so that new files can be created with this filename + g_simulator.getMachineByNetworkAddress( self->openedAddress )->closingFiles.erase(self->getFilename()); + g_simulator.getMachineByNetworkAddress( self->openedAddress )->deletingFiles.erase(self->getFilename()); + AsyncFileNonDurable::filesBeingDeleted.erase(self->filename); + //TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename); - delete self; - return Void(); + delete self; + Void _ = wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + return Void(); + } catch( Error &e ) { + state Error err = e; + Void _ = wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + throw err; + } } }; diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index f2d4b11e95..bbb094f93e 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -886,16 +886,29 @@ public: // This is a _rudimentary_ simulation of the untrustworthiness of non-durable deletes and the possibility of // rebooting during a durable one. It isn't perfect: for example, on real filesystems testing // for the existence of a non-durably deleted file BEFORE a reboot will show that it apparently doesn't exist. 
- g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + if(g_simulator.getCurrentProcess()->machine->openFiles.count(filename)) { + g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + g_simulator.getCurrentProcess()->machine->deletingFiles.insert(filename); + } if ( mustBeDurable || g_random->random01() < 0.5 ) { - Void _ = wait( ::delay(0.05 * g_random->random01()) ); - if (!self->getCurrentProcess()->rebooting) { - auto f = IAsyncFileSystem::filesystem(self->net2)->deleteFile(filename, false); - ASSERT( f.isReady() ); + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state int currentTaskID = g_network->getCurrentTask(); + Void _ = wait( g_simulator.onMachine( currentProcess ) ); + try { Void _ = wait( ::delay(0.05 * g_random->random01()) ); - TEST( true ); // Simulated durable delete + if (!currentProcess->rebooting) { + auto f = IAsyncFileSystem::filesystem(self->net2)->deleteFile(filename, false); + ASSERT( f.isReady() ); + Void _ = wait( ::delay(0.05 * g_random->random01()) ); + TEST( true ); // Simulated durable delete + } + Void _ = wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + return Void(); + } catch( Error &e ) { + state Error err = e; + Void _ = wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + throw err; } - return Void(); } else { TEST( true ); // Simulated non-durable delete return Void(); diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 310b17bf5f..e72c1505ad 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -108,6 +108,7 @@ public: ProcessInfo* machineProcess; std::vector processes; std::map>> openFiles; + std::set deletingFiles; std::set closingFiles; Optional> zoneId; diff --git a/fdbserver/DiskQueue.actor.cpp b/fdbserver/DiskQueue.actor.cpp index 83ef757179..9afd8d983b 100644 --- a/fdbserver/DiskQueue.actor.cpp +++ b/fdbserver/DiskQueue.actor.cpp @@ -419,8 +419,8 @@ public: } if( error.code() != error_code_actor_cancelled ) { - if (!self->stopped.isSet()) self->stopped.send(Void()); - if (!self->error.isSet()) self->error.send(Never()); + if (self->stopped.canBeSet()) self->stopped.send(Void()); + if (self->error.canBeSet()) self->error.send(Never()); delete self; } } diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index b774cf5beb..2e872a68d7 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -442,6 +442,11 @@ ACTOR Future simulatedMachine( ASSERT( it.second.isReady() && !it.second.isError() ); } + for( auto it : g_simulator.getMachineById(localities.zoneId())->deletingFiles ) { + filenames.insert( it ); + closingStr += it + ", "; + } + TraceEvent("SimulatedMachineRebootAfterKills") .detail("Folder0", myFolders[0]) .detail("CFolder0", coordFolders[0]) diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 109467341c..4184d21ada 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1745,7 +1745,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } } catch (Error& e) { - TraceEvent("TLogError", tlogId).error(e); + TraceEvent("TLogError", tlogId).error(e, true); while(!tlogRequests.isEmpty()) { tlogRequests.getFuture().pop().reply.sendError(recruitment_failed()); } diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 1341e131f4..9f04f566e8 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ 
b/fdbserver/workloads/ConsistencyCheck.actor.cpp
@@ -1086,7 +1086,7 @@ struct ConsistencyCheckWorkload : TestWorkload
 				for(auto id : stores.get()) {
 					if(!statefulProcesses[itr->first.address()].count(id)) {
 						TraceEvent("ConsistencyCheck_ExtraDataStore").detail("Address", itr->first.address()).detail("DataStoreID", id);
-						if(g_network->isSimulated()) {
+						if(g_network->isSimulated() && !g_simulator.speedUpSimulation) {
 							g_simulator.rebootProcess(g_simulator.getProcessByAddress(itr->first.address()), ISimulator::RebootProcess);
 						}

From 9be634858056574569af98b8f9b5196bd6edf9a2 Mon Sep 17 00:00:00 2001
From: Alec Grieser
Date: Fri, 25 Aug 2017 11:30:46 -0700
Subject: [PATCH 03/30] added tuple spec to design section of repo

---
 design/tuple.md | 192 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 design/tuple.md

diff --git a/design/tuple.md b/design/tuple.md
new file mode 100644
index 0000000000..b1c82f5250
--- /dev/null
+++ b/design/tuple.md
@@ -0,0 +1,192 @@
+# FDB Tuple layer typecodes
+
+This document is intended to be the system of record for the allocation of typecodes in the Tuple layer. The source code isn’t good enough because a typecode might be added to one language (or by a customer) before another.
+
+Status: Standard means that all of our language bindings implement this typecode
+Status: Reserved means that this typecode is not yet used in our standard language bindings, but may be in use by third party bindings or specific applications
+Status: Deprecated means that a previous layer used this type, but issues with that type code have led us to mark this type code as not to be used.
+
+
+### **Null Value**
+
+Typecode: 0x00
+Length: 0 bytes
+Status: Standard
+
+### **Byte String**
+
+Typecode: 0x01
+Length: Variable (terminated by `[\x00]![\xff]`)
+Encoding: `b'\x01' + value.replace(b'\x00', b'\x00\xFF') + b'\x00'`
+Test case: `pack(“foo\x00bar”) == b'\x01foo\x00\xffbar\x00'`
+Status: Standard
+
+In other words, byte strings are null terminated with null values occurring in the string escaped in an order-preserving way.
+
+### **Unicode String**
+
+Typecode: 0x02
+Length: Variable (terminated by [\x00]![\xff])
+Encoding: `b'\x02' + value.encode('utf-8').replace(b'\x00', b'\x00\xFF') + b'\x00'`
+Test case: `pack( u"F\u00d4O\u0000bar" ) == b'\x02F\xc3\x94O\x00\xffbar\x00'`
+Status: Standard
+
+This is the same way that byte strings are encoded, but first, the unicode string is encoded in UTF-8.
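To make the two string encodings above concrete, here is a minimal sketch of the order-preserving null escaping (not part of the patch; `pack_bytes` and `unpack_bytes` are illustrative names, not the bindings' API, and the decoder assumes it is handed exactly one encoded element):

```python
def pack_bytes(value: bytes) -> bytes:
    # Type code 0x01, payload with embedded nulls escaped as 00 FF, then a
    # 00 terminator. A terminating 00 is never followed by FF, so decoding is
    # unambiguous and lexicographic byte order matches the unescaped order.
    return b'\x01' + value.replace(b'\x00', b'\x00\xff') + b'\x00'

def unpack_bytes(encoded: bytes) -> bytes:
    assert encoded[0] == 0x01 and encoded[-1] == 0x00
    return encoded[1:-1].replace(b'\x00\xff', b'\x00')

assert pack_bytes(b'foo\x00bar') == b'\x01foo\x00\xffbar\x00'   # the spec's test case
```

A unicode string (type code 0x02) would be handled the same way after first encoding the text as UTF-8.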
+### **(DEPRECATED) Nested Tuple**
+
+Typecodes: 0x03-0x04
+Length: Variable (terminated by 0x04 type code)
+Status: Deprecated
+This encoding was used by a few layers. However, it had ordering problems when one tuple was a prefix of another and the type of the first element in the longer tuple was either null or a byte string. For an example, consider the empty tuple and the tuple containing only null. In the old scheme, the empty tuple would be encoded as `\x03\x04` while the tuple containing only null would be encoded as `\x03\x00\x04`, so the second tuple would sort first based on their bytes, which is incorrect semantically.
+
+### **Nested Tuple**
+
+Typecodes: 0x05
+Length: Variable (terminated by `[\x00]![\xff]` at beginning of nested element)
+Encoding: `b'\x05' + ''.join(map(lambda x: b'\x00\xff' if x is None else pack(x), value)) + b'\x00'`
+Test case: `pack( (“foo\x00bar”, None, ()) ) == b'\x05\x01foo\x00\xffbar\x00\x00\xff\x05\x00\x00'`
+Status: Standard
+
+ The list is ended with a 0x00 byte. Nulls within the tuple are encoded as `\x00\xff`. There is no other null escaping. In particular, 0x00 bytes that are within the nested types can be left as-is as they are passed over when decoding the interior types. To show how this fixes the bug in the previous version of nested tuples, the empty tuple is now encoded as `\x05\x00` while the tuple containing only null is encoded as `\x05\x00\xff\x00`, so the first tuple will sort first.
+
+### **Negative arbitrary-precision Integer**
+
+Typecodes: 0x0a, 0x0b
+Encoding: Not defined yet
+Status: Reserved; 0x0b used in Python and Java
+
+These typecodes are reserved for encoding integers larger than 8 bytes. Presumably the type code would be followed by some encoding of the length, followed by the big endian one’s complement number. Reserving two typecodes for each of positive and negative numbers is probably overkill, but until there’s a design in place we might as well not use them. In the Python and Java implementations, 0x0b stores negative numbers which are expressed with between 9 and 255 bytes. The first byte following the type code (0x0b) is a single byte expressing the number of bytes in the integer (with its bits flipped to preserve order), followed by that number of bytes representing the number in big endian order in one's complement.
+
+### **Integer**
+
+Typecodes: 0x0c - 0x1c
+	0x0c is an 8 byte negative number
+	0x13 is a 1 byte negative number
+	0x14 is a zero
+	0x15 is a 1 byte positive number
+	0x1c is an 8 byte positive number
+Length: Depends on typecode (0-8 bytes)
+Encoding: positive numbers are big endian
+	negative numbers are big endian one’s complement (so -1 is 0x13 0xfe)
+Test case: `pack( -5551212 ) == b'\x11\xabK\x93'`
+Status: Standard
+
+There is some variation in the ability of language bindings to encode and decode values at the outside of the possible range, because of different native representations of integers.
+
+### **Positive arbitrary-precision Integer**
+
+Typecodes: 0x1d, 0x1e
+Encoding: Not defined yet
+Status: Reserved; 0x1d used in Python and Java
+These typecodes are reserved for encoding integers larger than 8 bytes. Presumably the type code would be followed by some encoding of the length, followed by the big endian one’s complement number. Reserving two typecodes for each of positive and negative numbers is probably overkill, but until there’s a design in place we might as well not use them. In the Python and Java implementations, 0x1d stores positive numbers which are expressed with between 9 and 255 bytes. The first byte following the type code (0x1d) is a single byte expressing the number of bytes in the integer, followed by that number of bytes representing the number in big endian order.
+
+### **IEEE Binary Floating Point**
+
+Typecodes:
+	0x20 - float (32 bits)
+	0x21 - double (64 bits)
+	0x22 - long double (80 bits)
+Length: 4 - 10 bytes
+Test case: `pack( -42f ) == b'=\xd7\xff\xff'`
+Encoding: Big-endian IEEE binary representation, followed by the following transformation:
+`	if ord(rep[0])&0x80: # Check sign bit`
+`		# Flip all bits, this is easier in most other languages!`
+`		return "".join( chr(0xff^ord(r)) for r in rep )`
+`	else:`
+`		# Flip just the sign bit`
+`		return chr(0x80^ord(rep[0])) + rep[1:]`
+Status: Standard (float and double) ; Reserved (long double)
+
+The binary representation should not be assumed to be canonicalized (as to multiple representations of NaN, for example) by a reader. This order sorts all numbers in the following way:
+
+* All negative NaN values with order determined by mantissa bits (which are semantically meaningless)
+* Negative infinity
+* All real numbers in the standard order (except that -0.0 < 0.0)
+* Positive infinity
+* All positive NaN values with order determined by mantissa bits
+
+This should be equivalent to the standard IEEE total ordering.
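The transformation shown above can be restated as a runnable sketch (assuming Python 3 and the standard `struct` module; the function name is illustrative, and the real bindings differ in details such as prepending the type code):

```python
import struct

def float_repr_bytes(x: float) -> bytes:
    # Big-endian IEEE single precision, then the order-preserving transform:
    # flip every bit of a negative value, but only the sign bit of a positive one.
    rep = bytearray(struct.pack('>f', x))
    if rep[0] & 0x80:                       # sign bit set: negative
        return bytes(b ^ 0xff for b in rep)
    rep[0] ^= 0x80                          # positive: flip just the sign bit
    return bytes(rep)

assert float_repr_bytes(-42.0) == b'=\xd7\xff\xff'      # matches the spec's test case
assert float_repr_bytes(-0.0) < float_repr_bytes(0.0)   # -0.0 sorts below 0.0
```

The same transform applies to doubles with `'>d'` and the 0x21 type code.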
+### **Arbitrary-precision Decimal**
+
+Typecodes: 0x23, 0x24
+Length: Arbitrary
+Encoding: Scale followed by arbitrary precision integer
+Status: Reserved
+
+This encoding format has been used by layers. Note that this encoding makes almost no guarantees about ordering properties of tuple-encoded values and should thus generally be avoided.
+
+### **(Deprecated)**** True Value**
+
+Typecode: 0x25
+Length: 0 bytes
+Status: Deprecated
+
+### **False Value**
+
+Typecode: 0x26
+Length: 0 bytes
+Status: Standard
+
+### **True Value**
+
+Typecode: 0x27
+Length: 0 bytes
+Status: Standard
+
+Note that false will sort before true with the given encoding.
+
+### **RFC 4122 UUID**
+
+Typecode: 0x30
+Length: 16 bytes
+Encoding: Network byte order as defined in the rfc: [_http://www.ietf.org/rfc/rfc4122.txt_](http://www.ietf.org/rfc/rfc4122.txt)
+Status: Standard
+
+This is equivalent to the unsigned byte ordering of the UUID bytes in big-endian order.
+
+### **64 bit identifier**
+
+Typecode: 0x31
+Length: 8 bytes
+Encoding: Big endian unsigned 8-byte integer (typically random or perhaps semi-sequential)
+Status: Reserved
+
+There’s definitely some question of whether this deserves to be separated from a plain old 64 bit integer, but a separate type was desired in one of the third-party bindings. This type has not been ported over to the first-party bindings.
+
+### **80 bit versionstamp**
+
+Typecode: 0x32
+Length: 10 bytes
+Encoding: Big endian 10-byte integer. First/high 8 bytes are a database version, next two are batch version.
+Status: Reserved
+
+### 96 Bit Versionstamp
+
+Typecode: 0x33
+Length: 12 bytes
+Encoding: Big endian 12-byte integer. First/high 8 bytes are a database version, next two are batch version, next two are ordering within transaction.
+Status: Reserved
+
+The two versionstamp typecodes are reserved for future work adding compatibility between the tuple layer and versionstamp operations. Note that the first 80 bits of the 96 bit versionstamp are the same as the contents of the 80 bit versionstamp, and they correspond to what the `SET_VERSIONSTAMP_KEY` mutation will write into a database key, i.e., the first 8 bytes are a big-endian, unsigned version corresponding to the commit version of a transaction, and the next two bytes are a big-endian, unsigned batch number ordering transactions that are committed at the same version. The final two bytes of the 96 bit versionstamp are written by the client and should order writes within a single transaction, thereby providing a global order for all versions.
+
+### **User type codes**
+
+Typecode: 0x40 - 0x4f
+Length: Variable (user defined)
+Encoding: User defined
+Status: Reserved
+
+These type codes may be used by third party extenders without coordinating with us. If used in shipping software, the software should use the directory layer and specify a specific layer name when opening its directories to eliminate the possibility of conflicts.
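Returning to the Integer section above, a minimal sketch of the variable-length encoding it describes (not part of the patch; `pack_int` is an illustrative name, and real bindings also handle the arbitrary-precision type codes and range edge cases):

```python
def pack_int(v: int) -> bytes:
    # Zero is the bare type code 0x14. An n-byte positive value uses type code
    # 0x14 + n with a big-endian payload; an n-byte negative value uses
    # 0x14 - n with the one's complement of |v|, so byte order matches numeric order.
    if v == 0:
        return b'\x14'
    n = (abs(v).bit_length() + 7) // 8      # minimal payload length, 1..8
    assert 1 <= n <= 8
    if v > 0:
        return bytes([0x14 + n]) + v.to_bytes(n, 'big')
    payload = ((1 << (8 * n)) - 1 + v).to_bytes(n, 'big')   # one's complement of |v|
    return bytes([0x14 - n]) + payload

assert pack_int(-1) == b'\x13\xfe'                # the -1 example from the spec
assert pack_int(-5551212) == b'\x11\xabK\x93'     # the spec's test case
```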
+ +The only way in which future official, otherwise backward-compatible versions of the tuple layer would be expected to use these type codes is to implement some kind of actual extensibility point for this purpose - they will not be used for standard types. + +### Escape Character + +Typecode: 0xff +Length: N/A +Encoding: N/A +Status: Reserved + +This type code is not used for anything. However, several of the other tuple types depend on this type code not being used as a type code for other types in order to correctly escape bytes in an order-preserving way. Therefore, it would be a Very Bad Idea™ for future development to start using this code for anything else. From 833c388d89a6583b8011b8b8c44bcd375da2ecd5 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Fri, 25 Aug 2017 11:34:06 -0700 Subject: [PATCH 04/30] tried adding some newlines --- design/tuple.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/design/tuple.md b/design/tuple.md index b1c82f5250..dea100a8b6 100644 --- a/design/tuple.md +++ b/design/tuple.md @@ -9,16 +9,16 @@ Status: Deprecated means that a previous layer used this type, but issues with t ### **Null Value** -Typecode: 0x00 -Length: 0 bytes +Typecode: 0x00 +Length: 0 bytes Status: Standard ### **Byte String** -Typecode: 0x01 -Length: Variable (terminated by` [\x00]![\xff]`) -Encoding: `b'\x01' + value.replace(b'\x00', b'\x00\xFF') + b'\x00'` -Test case: `pack(“foo\x00bar”) == b'\x01foo\x00\xffbar\x00'` +Typecode: 0x01 +Length: Variable (terminated by` [\x00]![\xff]`) +Encoding: `b'\x01' + value.replace(b'\x00', b'\x00\xFF') + b'\x00'` +Test case: `pack(“foo\x00bar”) == b'\x01foo\x00\xffbar\x00'` Status: Standard In other words, byte strings are null terminated with null values occurring in the string escaped in an order-preserving way. From 850ab5b64e60cbd0b325d976e5460154e994d866 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Fri, 25 Aug 2017 11:36:33 -0700 Subject: [PATCH 05/30] MOAR NEWLINES --- design/tuple.md | 136 +++++++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 66 deletions(-) diff --git a/design/tuple.md b/design/tuple.md index dea100a8b6..feca15e9e5 100644 --- a/design/tuple.md +++ b/design/tuple.md @@ -25,77 +25,81 @@ In other words, byte strings are null terminated with null values occurring in t ### **Unicode String** -Typecode: 0x02 -Length: Variable (terminated by [\x00]![\xff]) -Encoding: `b'\x02' + value.encode('utf-8').replace(b'\x00', b'\x00\xFF') + b'\x00'` -Test case: `pack( u"F\u00d4O\u0000bar" ) == b'\x02F\xc3\x94O\x00\xffbar\x00'` +Typecode: 0x02 +Length: Variable (terminated by [\x00]![\xff]) +Encoding: `b'\x02' + value.encode('utf-8').replace(b'\x00', b'\x00\xFF') + b'\x00'` +Test case: `pack( u"F\u00d4O\u0000bar" ) == b'\x02F\xc3\x94O\x00\xffbar\x00'` Status: Standard This is the same way that byte strings are encoded, but first, the unicode string is encoded in UTF-8. ### **(DEBRECATED) Nested Tuple** -Typecodes: 0x03-0x04 -Length: Variable (terminated by 0x04 type code) -Status: Deprecated +Typecodes: 0x03-0x04 +Length: Variable (terminated by 0x04 type code) +Status: Deprecated + This encoding was used by a few layers. However, it had ordering problems when one tuple was a prefix of another and the type of the first element in the longer tuple was either null or a byte string. For an example, consider the empty tuple and the tuple containing only null. 
In the old scheme, the empty tuple would be encoded as `\x03\x04` while the tuple containing only null would be encoded as `\x03\x00\x04`, so the second tuple would sort first based on their bytes, which is incorrect semantically. ### **Nested Tuple** -Typecodes: 0x05 -Length: Variable (terminated by `[\x00]![\xff]` at beginning of nested element) -Encoding: `b'\x05' + ''.join(map(lambda x: b'\x00\xff' if x is None else pack(x), value)) + b'\x00'` -Test case: `pack( (“foo\x00bar”, None, ()) ) == b'\x05\x01foo\x00\xffbar\x00\x00\xff\x05\x00\x00'` +Typecodes: 0x05 +Length: Variable (terminated by `[\x00]![\xff]` at beginning of nested element) +Encoding: `b'\x05' + ''.join(map(lambda x: b'\x00\xff' if x is None else pack(x), value)) + b'\x00'` +Test case: `pack( (“foo\x00bar”, None, ()) ) == b'\x05\x01foo\x00\xffbar\x00\x00\xff\x05\x00\x00'` Status: Standard - The list is ended with a 0x00 byte. Nulls within the tuple are encoded as `\x00\xff`. There is no other null escaping. In particular, 0x00 bytes that are within the nested types can be left as-is as they are passed over when decoding the interior types. To show how this fixes the bug in the previous version of nested tuples, the empty tuple is now encoded as `\x05\x00` while the tuple containing only null is encoded as `\x05\x00\xff\x00`, so the first tuple will sort first. +The list is ended with a 0x00 byte. Nulls within the tuple are encoded as `\x00\xff`. There is no other null escaping. In particular, 0x00 bytes that are within the nested types can be left as-is as they are passed over when decoding the interior types. To show how this fixes the bug in the previous version of nested tuples, the empty tuple is now encoded as `\x05\x00` while the tuple containing only null is encoded as `\x05\x00\xff\x00`, so the first tuple will sort first. ### **Negative arbitrary-precision Integer** -Typecodes: 0x0a, 0x0b -Encoding: Not defined yet +Typecodes: 0x0a, 0x0b +Encoding: Not defined yet Status: Reserved; 0x0b used in Python and Java These typecodes are reserved for encoding integers larger than 8 bytes. Presumably the type code would be followed by some encoding of the length, followed by the big endian one’s complement number. Reserving two typecodes for each of positive and negative numbers is probably overkill, but until there’s a design in place we might as well not use them. In the Python and Java implementations, 0x0b stores negative numbers which are expressed with between 9 and 255 bytes. The first byte following the type code (0x0b) is a single byte expressing the number of bytes in the integer (with its bits flipped to preserve order), followed by that number of bytes representing the number in big endian order in one's complement. 
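As an illustration of the 0x0b convention described in the paragraph above (a sketch only; the function name is illustrative, and the spec still marks this encoding as not fully defined):

```python
def pack_big_negative(v: int) -> bytes:
    # For values too negative for the fixed 8-byte range: a length byte with
    # its bits flipped (so longer, more negative numbers sort earlier), then
    # the one's complement of |v| in big-endian order.
    n = (abs(v).bit_length() + 7) // 8
    assert v < 0 and 9 <= n <= 255
    payload = ((1 << (8 * n)) - 1 + v).to_bytes(n, 'big')
    return b'\x0b' + bytes([n ^ 0xff]) + payload

assert pack_big_negative(-(1 << 72))[:2] == b'\x0b\xf5'   # 10-byte payload, length byte flipped
```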
### **Integer** -Typecodes: 0x0c - 0x1c - 0x0c is an 8 byte negative number - 0x13 is a 1 byte negative number - 0x14 is a zero - 0x15 is a 1 byte positive number - 0x1c is an 8 byte positive number -Length: Depends on typecode (0-8 bytes) -Encoding: positive numbers are big endian - negative numbers are big endian one’s complement (so -1 is 0x13 0xfe) -Test case: `pack( -5551212 ) == b'\x11\xabK\x93'` +Typecodes: 0x0c - 0x1c + 0x0c is an 8 byte negative number + 0x13 is a 1 byte negative number + 0x14 is a zero + 0x15 is a 1 byte positive number + 0x1c is an 8 byte positive number +Length: Depends on typecode (0-8 bytes) +Encoding: positive numbers are big endian + negative numbers are big endian one’s complement (so -1 is 0x13 0xfe) +Test case: `pack( -5551212 ) == b'\x11\xabK\x93'` Status: Standard There is some variation in the ability of language bindings to encode and decode values at the outside of the possible range, because of different native representations of integers. ### **Positive arbitrary-precision Integer** -Typecodes: 0x1d, 0x1e -Encoding: Not defined yet +Typecodes: 0x1d, 0x1e +Encoding: Not defined yet Status: Reserved; 0x1d used in Python and Java + These typecodes are reserved for encoding integers larger than 8 bytes. Presumably the type code would be followed by some encoding of the length, followed by the big endian one’s complement number. Reserving two typecodes for each of positive and negative numbers is probably overkill, but until there’s a design in place we might as well not use them. In the Python and Java implementations, 0x1d stores positive numbers which are expressed with between 9 and 255 bytes. The first byte following the type code (0x1d) is a single byte expressing the number of bytes in the integer, followed by that number of bytes representing the number in big endian order. ### **IEEE Binary Floating Point** -Typecodes: - 0x20 - float (32 bits) - 0x21 - double (64 bits) - 0x22 - long double (80 bits) -Length: 4 - 10 bytes -Test case: `pack( -42f ) == b'=\xd7\xff\xff'` -Encoding: Big-endian IEEE binary representation, followed by the following transformation: -` if ord(rep[0])&0x80: # Check sign bit` -` # Flip all bits, this is easier in most other languages!` -` return "".join( chr(0xff^ord(r)) for r in rep )` -` else:` -` # Flip just the sign bit` -` return chr(0x80^ord(rep[0])) + rep[1:]` +Typecodes: + 0x20 - float (32 bits) + 0x21 - double (64 bits) + 0x22 - long double (80 bits) +Length: 4 - 10 bytes +Test case: `pack( -42f ) == b'=\xd7\xff\xff'` +Encoding: Big-endian IEEE binary representation, followed by the following transformation: +```python + if ord(rep[0])&0x80: # Check sign bit + # Flip all bits, this is easier in most other languages! + return "".join( chr(0xff^ord(r)) for r in rep ) + else: + # Flip just the sign bit + return chr(0x80^ord(rep[0])) + rep[1:] +``` Status: Standard (float and double) ; Reserved (long double) The binary representation should not be assumed to be canonicalized (as to multiple representations of NaN, for example) by a reader. This order sorts all numbers in the following way: @@ -110,72 +114,72 @@ This should be equivalent to the standard IEEE total ordering. ### **Arbitrary-precision Decimal** -Typecodes: 0x23, 0x24 -Length: Arbitrary -Encoding: Scale followed by arbitrary precision integer +Typecodes: 0x23, 0x24 +Length: Arbitrary +Encoding: Scale followed by arbitrary precision integer Status: Reserved This encoding format has been used by layers. 
Note that this encoding makes almost no guarantees about ordering properties of tuple-encoded values and should thus generally be avoided. ### **(Deprecated)**** True Value** -Typecode: 0x25 -Length: 0 bytes +Typecode: 0x25 +Length: 0 bytes Status: Deprecated ### **False Value** -Typecode: 0x26 -Length: 0 bytes +Typecode: 0x26 +Length: 0 bytes Status: Standard ### **True Value** -Typecode: 0x27 -Length: 0 bytes +Typecode: 0x27 +Length: 0 bytes Status: Standard Note that false will sort before true with the given encoding. ### **RFC 4122 UUID** -Typecode: 0x30 -Length: 16 bytes -Encoding: Network byte order as defined in the rfc: [_http://www.ietf.org/rfc/rfc4122.txt_](http://www.ietf.org/rfc/rfc4122.txt) +Typecode: 0x30 +Length: 16 bytes +Encoding: Network byte order as defined in the rfc: [_http://www.ietf.org/rfc/rfc4122.txt_](http://www.ietf.org/rfc/rfc4122.txt) Status: Standard This is equivalent to the unsigned byte ordering of the UUID bytes in big-endian order. ### **64 bit identifier** -Typecode: 0x31 -Length: 8 bytes -Encoding: Big endian unsigned 8-byte integer (typically random or perhaps semi-sequential) +Typecode: 0x31 +Length: 8 bytes +Encoding: Big endian unsigned 8-byte integer (typically random or perhaps semi-sequential) Status: Reserved There’s definitely some question of whether this deserves to be separated from a plain old 64 bit integer, but a separate type was desired in one of the third-party bindings. This type has not been ported over to the first-party bindings. ### **80 bit versionstamp** -Typecode: 0x32 -Length: 10 bytes -Encoding: Big endian 10-byte integer. First/high 8 bytes are a database version, next two are batch version. +Typecode: 0x32 +Length: 10 bytes +Encoding: Big endian 10-byte integer. First/high 8 bytes are a database version, next two are batch version. Status: Reserved ### 96 Bit Versionstamp -Typecode: 0x33 -Length: 12 bytes -Encoding: Big endian 12-byte integer. First/high 8 bytes are a database version, next two are batch version, next two are ordering within transaction. +Typecode: 0x33 +Length: 12 bytes +Encoding: Big endian 12-byte integer. First/high 8 bytes are a database version, next two are batch version, next two are ordering within transaction. Status: Reserved The two versionstamp typecodes are reserved for future work adding compatibility between the tuple layer and versionstamp operations. Note that the first 80 bits of the 96 bit versionstamp are the same as the contents of the 80 bit versionstamp, and they correspond to what the `SET_VERSIONSTAMP_KEY` mutation will write into a database key , i.e., the first 8 bytes are a big-endian, unsigned version corresponding to the commit version of a transaction, and the next to bytes are a big-endian, unsigned batch number ordering transactions are committed at the same version. The final two bytes of the 96 bit versionstamp are written by the client and should order writes within a single transaction, thereby providing a global order for all versions. ### **User type codes** -Typecode: 0x40 - 0x4f -Length: Variable (user defined) -Encoding: User defined +Typecode: 0x40 - 0x4f +Length: Variable (user defined) +Encoding: User defined Status: Reserved These type codes may be used by third party extenders without coordinating with us. If used in shipping software, the software should use the directory layer and specify a specific layer name when opening its directories to eliminate the possibility of conflicts. 
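For the two versionstamp layouts a few sections above, a sketch of how the fields could be assembled (names are illustrative; both type codes are still Reserved, so this is not a committed format):

```python
import struct

def versionstamp80(db_version: int, batch: int) -> bytes:
    # 0x32: 8-byte big-endian database version, then 2-byte batch version.
    return b'\x32' + struct.pack('>QH', db_version, batch)

def versionstamp96(db_version: int, batch: int, tx_order: int) -> bytes:
    # 0x33: the 80-bit layout plus 2 client-written bytes ordering writes
    # within a single transaction.
    return b'\x33' + struct.pack('>QHH', db_version, batch, tx_order)
```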
@@ -184,9 +188,9 @@ The only way in which future official, otherwise backward-compatible versions of ### Escape Character -Typecode: 0xff -Length: N/A -Encoding: N/A +Typecode: 0xff +Length: N/A +Encoding: N/A Status: Reserved This type code is not used for anything. However, several of the other tuple types depend on this type code not being used as a type code for other types in order to correctly escape bytes in an order-preserving way. Therefore, it would be a Very Bad Idea™ for future development to start using this code for anything else. From fef18ee0a3ee70d2c119ef340f664266d48924f6 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Fri, 25 Aug 2017 11:38:21 -0700 Subject: [PATCH 06/30] worked through some formatting issues --- design/tuple.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/design/tuple.md b/design/tuple.md index feca15e9e5..58f7533c36 100644 --- a/design/tuple.md +++ b/design/tuple.md @@ -62,11 +62,11 @@ These typecodes are reserved for encoding integers larger than 8 bytes. Presumab ### **Integer** Typecodes: 0x0c - 0x1c - 0x0c is an 8 byte negative number - 0x13 is a 1 byte negative number - 0x14 is a zero - 0x15 is a 1 byte positive number - 0x1c is an 8 byte positive number + 0x0c is an 8 byte negative number + 0x13 is a 1 byte negative number + 0x14 is a zero + 0x15 is a 1 byte positive number + 0x1c is an 8 byte positive number Length: Depends on typecode (0-8 bytes) Encoding: positive numbers are big endian negative numbers are big endian one’s complement (so -1 is 0x13 0xfe) @@ -86,9 +86,9 @@ These typecodes are reserved for encoding integers larger than 8 bytes. Presumab ### **IEEE Binary Floating Point** Typecodes: - 0x20 - float (32 bits) - 0x21 - double (64 bits) - 0x22 - long double (80 bits) + 0x20 - float (32 bits) + 0x21 - double (64 bits) + 0x22 - long double (80 bits) Length: 4 - 10 bytes Test case: `pack( -42f ) == b'=\xd7\xff\xff'` Encoding: Big-endian IEEE binary representation, followed by the following transformation: @@ -121,7 +121,7 @@ Status: Reserved This encoding format has been used by layers. Note that this encoding makes almost no guarantees about ordering properties of tuple-encoded values and should thus generally be avoided. -### **(Deprecated)**** True Value** +### **(Deprecated) True Value** Typecode: 0x25 Length: 0 bytes From 9b10fb627cfeb233bf703909ac535313fd8b6a79 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Fri, 25 Aug 2017 11:39:08 -0700 Subject: [PATCH 07/30] capitalize DEPRECATED --- design/tuple.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/design/tuple.md b/design/tuple.md index 58f7533c36..0cc394c280 100644 --- a/design/tuple.md +++ b/design/tuple.md @@ -121,7 +121,7 @@ Status: Reserved This encoding format has been used by layers. Note that this encoding makes almost no guarantees about ordering properties of tuple-encoded values and should thus generally be avoided. 
-### **(Deprecated) True Value** +### **(DEPRECATED) True Value** Typecode: 0x25 Length: 0 bytes From 9e2b0debcd37e3d491061bd1379b91910bf634f8 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Fri, 25 Aug 2017 11:43:13 -0700 Subject: [PATCH 08/30] made emboldening and capitalization consistent --- design/tuple.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/design/tuple.md b/design/tuple.md index 0cc394c280..aed8858f32 100644 --- a/design/tuple.md +++ b/design/tuple.md @@ -159,14 +159,14 @@ Status: Reserved There’s definitely some question of whether this deserves to be separated from a plain old 64 bit integer, but a separate type was desired in one of the third-party bindings. This type has not been ported over to the first-party bindings. -### **80 bit versionstamp** +### **80 Bit versionstamp** Typecode: 0x32 Length: 10 bytes Encoding: Big endian 10-byte integer. First/high 8 bytes are a database version, next two are batch version. Status: Reserved -### 96 Bit Versionstamp +### **96 Bit Versionstamp** Typecode: 0x33 Length: 12 bytes @@ -186,7 +186,7 @@ These type codes may be used by third party extenders without coordinating with The only way in which future official, otherwise backward-compatible versions of the tuple layer would be expected to use these type codes is to implement some kind of actual extensibility point for this purpose - they will not be used for standard types. -### Escape Character +### **Escape Character** Typecode: 0xff Length: N/A From 86d025f9430f983abb63b229de66b9af31af8cdb Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sun, 27 Aug 2017 22:22:32 -0700 Subject: [PATCH 09/30] Bug fix: Metric base enabled state was not being initialized. Metrics are configured to be disabled upon construction, however if during construction it appears that a metric was initially enabled then a crash would result if the MetricsCollection global was not created. --- flow/TDMetric.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/TDMetric.actor.h b/flow/TDMetric.actor.h index 093b8ebc3a..6f5344985d 100755 --- a/flow/TDMetric.actor.h +++ b/flow/TDMetric.actor.h @@ -680,7 +680,7 @@ struct TimeDescriptor { }; struct BaseMetric { - BaseMetric(MetricNameRef const &name) : metricName(name), pCollection(nullptr), registered(false) { + BaseMetric(MetricNameRef const &name) : metricName(name), pCollection(nullptr), registered(false), enabled(false) { setConfig(false); } virtual ~BaseMetric() { From 581bd6c8ed96bbe81ca4e80b132effbaa1bf643f Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Mon, 28 Aug 2017 10:53:56 -0700 Subject: [PATCH 10/30] Added option to delay the displaying of the simulation workers --- fdbserver/workloads/DummyWorkload.actor.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fdbserver/workloads/DummyWorkload.actor.cpp b/fdbserver/workloads/DummyWorkload.actor.cpp index a936e5a182..ee0bd55cf8 100644 --- a/fdbserver/workloads/DummyWorkload.actor.cpp +++ b/fdbserver/workloads/DummyWorkload.actor.cpp @@ -24,10 +24,12 @@ // The workload that do nothing. 
It can be used for waiting for quiescence struct DummyWorkload : TestWorkload { bool displayWorkers; + double displayDelay; DummyWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { displayWorkers = getOption(options, LiteralStringRef("displayWorkers"), true); + displayDelay = getOption(options, LiteralStringRef("displayDelay"), 0.0); } virtual std::string description() { @@ -35,8 +37,16 @@ struct DummyWorkload : TestWorkload { } virtual Future start(Database const& cx) { - if ((clientId == 0) && (displayWorkers)) - g_simulator.displayWorkers(); + if ((clientId == 0) && (displayWorkers)) { + return _start(this, cx); + } + return Void(); + } + + ACTOR static Future _start( DummyWorkload* self, Database cx) { + if (self->displayDelay > 0.0) + Void _ = wait(delay(self->displayDelay)); + g_simulator.displayWorkers(); return Void(); } From 44e0df78c54e7c681653c93760011223feb09387 Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Mon, 28 Aug 2017 11:25:37 -0700 Subject: [PATCH 11/30] Added support for tracking roles for simulation workers Fixed the exclusion and inclusion address simulation API and integration within workloads Added more information within trace events for simulation --- fdbrpc/sim2.actor.cpp | 82 +++--- fdbrpc/simulator.h | 75 +++++- fdbserver/ClusterController.actor.cpp | 46 ++-- fdbserver/SimulatedCluster.actor.cpp | 55 ++-- fdbserver/worker.actor.cpp | 10 +- .../workloads/MachineAttrition.actor.cpp | 6 +- .../workloads/RemoveServersSafely.actor.cpp | 237 +++++++++++------- 7 files changed, 342 insertions(+), 169 deletions(-) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index bbb094f93e..9e30626193 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -77,12 +77,12 @@ void ISimulator::displayWorkers() const } printf("DataHall ZoneId\n"); - printf(" Address Name Class Excluded Failed Rebooting DataFolder\n"); + printf(" Address Name Class Excluded Failed Rebooting Role DataFolder\n"); for (auto& zoneRecord : zoneMap) { printf("\n%s\n", zoneRecord.first.c_str()); for (auto& processInfo : zoneRecord.second) { - printf(" %9s %-10s %-7s %-8s %-6s %-9s %-40s\n", - processInfo->address.toString().c_str(), processInfo->name, processInfo->startingClass.toString().c_str(), (processInfo->excluded ? "True" : "False"), (processInfo->failed ? "True" : "False"), (processInfo->rebooting ? "True" : "False"), processInfo->dataFolder); + printf(" %9s %-10s%-13s%-8s %-6s %-9s %-48s %-40s\n", + processInfo->address.toString().c_str(), processInfo->name, processInfo->startingClass.toString().c_str(), (processInfo->excluded ? "True" : "False"), (processInfo->failed ? "True" : "False"), (processInfo->rebooting ? "True" : "False"), getRoles(processInfo->address).c_str(), processInfo->dataFolder); } } @@ -1001,8 +1001,11 @@ public: for (auto processInfo : getAllProcesses()) { // Add non-test processes (ie. datahall is not be set for test processes) if (processInfo->isAvailableClass()) { + // Ignore excluded machines + if (processInfo->excluded) + ; // Mark all of the unavailable as dead - if (!processInfo->isAvailable()) + else if (!processInfo->isAvailable()) processesDead.push_back(processInfo); else if (protectedAddresses.count(processInfo->address)) processesLeft.push_back(processInfo); @@ -1056,22 +1059,22 @@ public: } // Reboot and Delete if remaining machines do NOT fulfill policies else if ((kt != RebootAndDelete) && (kt != RebootProcessAndDelete) && (!processesLeft.validate(tLogPolicy))) { - auto newKt = (g_random->random01() < 0.33) ? 
RebootAndDelete : Reboot; + newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot; canSurvive = false; TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("tLogPolicy", tLogPolicy->info()).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("RemainingZones", ::describeZones(localitiesLeft)).detail("RemainingDataHalls", ::describeDataHalls(localitiesLeft)).detail("Reason", "tLogPolicy does not validates against remaining processes."); } else if ((kt != RebootAndDelete) && (kt != RebootProcessAndDelete) && (!processesLeft.validate(storagePolicy))) { - auto newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot; + newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot; canSurvive = false; TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("storagePolicy", storagePolicy->info()).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("RemainingZones", ::describeZones(localitiesLeft)).detail("RemainingDataHalls", ::describeDataHalls(localitiesLeft)).detail("Reason", "storagePolicy does not validates against remaining processes."); } else if ((kt != RebootAndDelete) && (kt != RebootProcessAndDelete) && (nQuorum > uniqueMachines.size())) { - auto newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot; + newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot; canSurvive = false; TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("storagePolicy", storagePolicy->info()).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("RemainingZones", ::describeZones(localitiesLeft)).detail("RemainingDataHalls", ::describeDataHalls(localitiesLeft)).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()).detail("Reason", "Not enough unique machines to perform auto configuration of coordinators."); } else { - TraceEvent("CanSurviveKills").detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("DeadZones", ::describeZones(localitiesDead)).detail("DeadDataHalls", ::describeDataHalls(localitiesDead)).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info()).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()).detail("ZonesLeft", ::describeZones(localitiesLeft)).detail("ValidateRemaining", processesLeft.validate(tLogPolicy)); + TraceEvent("CanSurviveKills").detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("DeadZones", ::describeZones(localitiesDead)).detail("DeadDataHalls", ::describeDataHalls(localitiesDead)).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info()).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()).detail("ZonesLeft", ::describeZones(localitiesLeft)).detail("DataHallsLeft", ::describeDataHalls(localitiesLeft)).detail("ValidateRemaining", processesLeft.validate(tLogPolicy)); } } if (newKillType) *newKillType = newKt; @@ -1095,12 +1098,12 @@ public: TEST( kt == InjectFaults ); // Simulated machine was killed with faults if (kt == KillInstantly) { - TraceEvent(SevWarn, "FailMachine").detail("Name", machine->name).detail("Address", machine->address).detailext("ZoneId", machine->locality.zoneId()).detail("Process", describe(*machine)).detail("Rebooting", 
machine->rebooting).backtrace(); + TraceEvent(SevWarn, "FailMachine", machine->locality.zoneId()).detail("Name", machine->name).detail("Address", machine->address).detailext("ZoneId", machine->locality.zoneId()).detail("Process", describe(*machine)).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace(); // This will remove all the "tracked" messages that came from the machine being killed latestEventCache.clear(); machine->failed = true; } else if (kt == InjectFaults) { - TraceEvent(SevWarn, "FaultMachine").detail("Name", machine->name).detail("Address", machine->address).detailext("ZoneId", machine->locality.zoneId()).detail("Process", describe(*machine)).detail("Rebooting", machine->rebooting).backtrace(); + TraceEvent(SevWarn, "FaultMachine", machine->locality.zoneId()).detail("Name", machine->name).detail("Address", machine->address).detailext("ZoneId", machine->locality.zoneId()).detail("Process", describe(*machine)).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace(); should_inject_fault = simulator_should_inject_fault; machine->fault_injection_r = g_random->randomUniqueID().first(); machine->fault_injection_p1 = 0.1; @@ -1111,8 +1114,10 @@ public: ASSERT(!protectedAddresses.count(machine->address) || machine->rebooting); } virtual void rebootProcess( ProcessInfo* process, KillType kt ) { - if( kt == RebootProcessAndDelete && protectedAddresses.count(process->address) ) + if( kt == RebootProcessAndDelete && protectedAddresses.count(process->address) ) { + TraceEvent("RebootChanged").detail("ZoneId", process->locality.describeZone()).detail("KillType", RebootProcess).detail("OrigKillType", kt).detail("Reason", "Protected process"); kt = RebootProcess; + } doReboot( process, kt ); } virtual void rebootProcess(Optional> zoneId, bool allProcesses ) { @@ -1157,6 +1162,7 @@ public: TEST(kt == InjectFaults); // Trying to kill by injecting faults if(speedUpSimulation && !forceKill) { + TraceEvent(SevWarn, "AbortedKill", zoneId).detailext("ZoneId", zoneId).detail("Reason", "Unforced kill within speedy simulation.").backtrace(); return false; } @@ -1181,15 +1187,25 @@ public: if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)) { std::vector processesLeft, processesDead; + int protectedWorker = 0, unavailable = 0, excluded = 0; for (auto machineRec : machines) { for (auto processInfo : machineRec.second.processes) { // Add non-test processes (ie. 
datahall is not be set for test processes) if (processInfo->isAvailableClass()) { - if (!processInfo->isAvailable()) + // Do not include any excluded machines + if (processInfo->excluded) { processesDead.push_back(processInfo); - else if (protectedAddresses.count(processInfo->address)) + excluded ++; + } + else if (!processInfo->isAvailable()) { + processesDead.push_back(processInfo); + unavailable ++; + } + else if (protectedAddresses.count(processInfo->address)) { processesLeft.push_back(processInfo); + protectedWorker ++; + } else if (machineRec.second.zoneId != zoneId) processesLeft.push_back(processInfo); // Add processes from dead machines and datacenter machines to dead group @@ -1202,7 +1218,7 @@ public: if ((kt != Reboot) && (!killIsSafe)) { kt = Reboot; } - TraceEvent("ChangedKillMachine", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("OrigKillType", ktOrig).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("processesPerMachine", processesPerMachine).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info()); + TraceEvent("ChangedKillMachine", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("OrigKillType", ktOrig).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("processesPerMachine", processesPerMachine).detail("Protected", protectedWorker).detail("Unavailable", unavailable).detail("Excluded", excluded).detail("ProtectedTotal", protectedAddresses.size()).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info()); } else if ((kt == KillInstantly) || (kt == InjectFaults)) { TraceEvent("DeadMachine", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("processesPerMachine", processesPerMachine).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info()); @@ -1229,31 +1245,30 @@ public: // Check if any processes on machine are rebooting if( processesOnMachine != processesPerMachine && kt >= RebootAndDelete ) { TEST(true); //Attempted reboot, but the target did not have all of its processes running - TraceEvent(SevWarn, "AbortedReboot", zoneId).detailext("ZoneId", zoneId).detail("Reason", "The target did not have all of its processes running.").detail("processes", processesOnMachine).detail("processesPerMachine", processesPerMachine).backtrace(); + TraceEvent(SevWarn, "AbortedKill", zoneId).detail("KillType", kt).detailext("ZoneId", zoneId).detail("Reason", "Machine processes does not match number of processes per machine").detail("processes", processesOnMachine).detail("processesPerMachine", processesPerMachine).backtrace(); return false; } // Check if any processes on machine are rebooting if ( processesOnMachine != processesPerMachine) { TEST(true); //Attempted reboot, but the target did not have all of its processes running - TraceEvent(SevWarn, "AbortedKill", zoneId).detailext("ZoneId", zoneId).detail("Reason", "The target did not have all of its processes running.").detail("processes", processesOnMachine).detail("processesPerMachine", processesPerMachine).backtrace(); + TraceEvent(SevWarn, "AbortedKill", zoneId).detail("KillType", kt).detailext("ZoneId", zoneId).detail("Reason", "Machine processes does not match number of processes 
per machine").detail("processes", processesOnMachine).detail("processesPerMachine", processesPerMachine).backtrace(); return false; } - TraceEvent("KillMachine", zoneId).detailext("ZoneId", zoneId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KilledMachines", killedMachines).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig).detail("killIsSafe", killIsSafe); if (kt < RebootAndDelete ) { if(kt == InjectFaults && machines[zoneId].machineProcess != nullptr) killProcess_internal( machines[zoneId].machineProcess, kt ); for (auto& process : machines[zoneId].processes) { - TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("startingClass", process->startingClass.toString()); + TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("startingClass", process->startingClass.toString()).detail("failed", process->failed).detail("excluded", process->excluded).detail("rebooting", process->rebooting); if (process->startingClass != ProcessClass::TesterClass) killProcess_internal( process, kt ); } } else if ( kt == Reboot || killIsSafe) { for (auto& process : machines[zoneId].processes) { - TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("startingClass", process->startingClass.toString()); + TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("startingClass", process->startingClass.toString()).detail("failed", process->failed).detail("excluded", process->excluded).detail("rebooting", process->rebooting); if (process->startingClass != ProcessClass::TesterClass) doReboot(process, kt ); } @@ -1269,13 +1284,16 @@ public: int dcProcesses = 0; // Switch to a reboot, if anything protected on machine - for (auto& process : processes) { - auto processDcId = process->locality.dcId(); - auto processZoneId = process->locality.zoneId(); + for (auto& procRecord : processes) { + auto processDcId = procRecord->locality.dcId(); + auto processZoneId = procRecord->locality.zoneId(); ASSERT(processZoneId.present()); if (processDcId.present() && (processDcId == dcId)) { - if (protectedAddresses.count(process->address)) + if ((kt != Reboot) && (protectedAddresses.count(procRecord->address))) { kt = Reboot; + TraceEvent(SevWarn, "DcKillChanged").detailext("DataCenter", dcId).detail("KillType", kt).detail("OrigKillType", ktOrig) + .detail("Reason", "Datacenter has protected process").detail("ProcessAddress", procRecord->address).detail("failed", procRecord->failed).detail("rebooting", procRecord->rebooting).detail("excluded", procRecord->excluded).detail("Process", describe(*procRecord)); + } datacenterZones[processZoneId.get()] ++; dcProcesses ++; } @@ -1290,7 +1308,9 @@ public: // Add non-test processes (ie. 
datahall is not be set for test processes) if (processInfo->isAvailableClass()) { // Mark all of the unavailable as dead - if (!processInfo->isAvailable()) + if (processInfo->excluded) + processesDead.push_back(processInfo); + else if (!processInfo->isAvailable()) processesDead.push_back(processInfo); else if (protectedAddresses.count(processInfo->address)) processesLeft.push_back(processInfo); @@ -1304,7 +1324,7 @@ public: } if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) { - TraceEvent(SevWarn, "DcKillChanged").detailext("DataCenter", dcId).detail("KillType", ktOrig).detail("NewKillType", kt); + TraceEvent(SevWarn, "DcKillChanged").detailext("DataCenter", dcId).detail("KillType", kt).detail("OrigKillType", ktOrig); } else { TraceEvent("DeadDataCenter").detailext("DataCenter", dcId).detail("KillType", kt).detail("DcZones", datacenterZones.size()).detail("DcProcesses", dcProcesses).detail("ProcessesDead", processesDead.size()).detail("ProcessesLeft", processesLeft.size()).detail("tLogPolicy", storagePolicy->info()).detail("storagePolicy", storagePolicy->info()); @@ -1319,10 +1339,13 @@ public: .detail("DcZones", datacenterZones.size()) .detail("DcProcesses", dcProcesses) .detailext("DCID", dcId) - .detail("KillType", kt); + .detail("KillType", kt) + .detail("OrigKillType", ktOrig); for (auto& datacenterZone : datacenterZones) - killMachine( datacenterZone.first, kt, (kt == RebootAndDelete), true); + killMachine( datacenterZone.first, kt, (kt == RebootAndDelete), true); +// ahm If above doesn't work, go conservative +// killMachine( datacenterZone.first, kt, false, true); } virtual void clogInterface( uint32_t ip, double seconds, ClogMode mode = ClogDefault ) { if (mode == ClogDefault) { @@ -1500,6 +1523,9 @@ static double networkLatency() { } ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) { + TraceEvent("RebootingProcessAttempt").detailext("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("startingClass", p->startingClass.toString()).detail("failed", p->failed).detail("excluded", p->excluded).detail("rebooting", p->rebooting).detail("TaskDefaultDelay", TaskDefaultDelay); +// ASSERT(p->failed); //ahm + Void _ = wait( g_sim2.delay( 0, TaskDefaultDelay, p ) ); // Switch to the machine in question try { @@ -1512,7 +1538,7 @@ ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) { if( p->rebooting ) return; - TraceEvent("RebootingMachine").detail("KillType", kt).detail("Address", p->address).detailext("ZoneId", p->locality.zoneId()).detailext("DataHall", p->locality.dataHallId()).detail("Locality", p->locality.toString()); + TraceEvent("RebootingProcess").detail("KillType", kt).detail("Address", p->address).detailext("ZoneId", p->locality.zoneId()).detailext("DataHall", p->locality.dataHallId()).detail("Locality", p->locality.toString()).detail("failed", p->failed).detail("excluded", p->excluded).backtrace(); p->rebooting = true; p->shutdownSignal.send( kt ); } catch (Error& e) { diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index e72c1505ad..7643018b25 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -151,17 +151,81 @@ public: virtual bool isAvailable() const = 0; virtual void displayWorkers() const; - virtual void excludeAddress(NetworkAddress const& address) { - excludedAddresses.insert(address); + virtual void addRole(NetworkAddress const& address, std::string const& role) { + roleAddresses[address][role] ++; + TraceEvent("RoleAdd").detail("Address", 
address).detail("Role", role).detail("Roles", roleAddresses[address].size()).detail("Value", roleAddresses[address][role]); } + + virtual void removeRole(NetworkAddress const& address, std::string const& role) { + auto addressIt = roleAddresses.find(address); + if (addressIt != roleAddresses.end()) { + auto rolesIt = addressIt->second.find(role); + if (rolesIt != addressIt->second.end()) { + if (rolesIt->second > 1) { + rolesIt->second --; + TraceEvent("RoleRemove").detail("Address", address).detail("Role", role).detail("Roles", addressIt->second.size()).detail("Value", rolesIt->second).detail("Result", "Decremented Role"); + } + else { + addressIt->second.erase(rolesIt); + if (addressIt->second.size()) { + TraceEvent("RoleRemove").detail("Address", address).detail("Role", role).detail("Roles", addressIt->second.size()).detail("Value", 0).detail("Result", "Removed Role"); + } + else { + roleAddresses.erase(addressIt); + TraceEvent("RoleRemove").detail("Address", address).detail("Role", role).detail("Roles", 0).detail("Value", 0).detail("Result", "Removed Address"); + } + } + } + else { + TraceEvent(SevWarn,"RoleRemove").detail("Address", address).detail("Role", role).detail("Result", "Role Missing"); + } + } + else { + TraceEvent(SevWarn,"RoleRemove").detail("Address", address).detail("Role", role).detail("Result", "Address Missing"); + } + } + + virtual std::string getRoles(NetworkAddress const& address, bool skipWorkers = true) const { + auto addressIt = roleAddresses.find(address); + std::string roleText; + if (addressIt != roleAddresses.end()) { + for (auto& roleIt : addressIt->second) { + if ((!skipWorkers) || (roleIt.first != "Worker")) + roleText += roleIt.first + ((roleIt.second > 1) ? format("-%d ", roleIt.second) : " "); + } + } + if (roleText.empty()) + roleText = "[unset]"; + return roleText; + } + + virtual void excludeAddress(NetworkAddress const& address) { + excludedAddresses[address]++; + TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]); + } + virtual void includeAddress(NetworkAddress const& address) { - excludedAddresses.erase(address); + auto addressIt = excludedAddresses.find(address); + if (addressIt != excludedAddresses.end()) { + if (addressIt->second > 1) { + addressIt->second --; + TraceEvent("IncludeAddress").detail("Address", address).detail("Value", addressIt->second).detail("Result", "Decremented"); + } + else { + excludedAddresses.erase(addressIt); + TraceEvent("IncludeAddress").detail("Address", address).detail("Value", 0).detail("Result", "Removed"); + } + } + else { + TraceEvent(SevWarn,"IncludeAddress").detail("Address", address).detail("Result", "Missing"); + } } virtual void includeAllAddresses() { + TraceEvent("IncludeAddressAll").detail("AddressTotal", excludedAddresses.size()); excludedAddresses.clear(); } virtual bool isExcluded(NetworkAddress const& address) const { - return excludedAddresses.count(address) == 0; + return excludedAddresses.find(address) != excludedAddresses.end(); } virtual void disableSwapToMachine(Optional> zoneId ) { @@ -230,7 +294,8 @@ protected: private: std::set>> swapsDisabled; - std::set excludedAddresses; + std::map excludedAddresses; + std::map> roleAddresses; bool allSwapsDisabled; }; diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index c7315c2813..631862fe95 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -218,6 +218,7 @@ std::vector> getWorkersForTlogsAcrossDa std::vector 
unavailableLocals; LocalitySetRef logServerSet; LocalityMap>* logServerMap; + UID functionId = g_nondeterministic_random->randomUniqueID(); bool bCompleted = false; logServerSet = Reference(new LocalityMap>()); @@ -230,7 +231,7 @@ std::vector> getWorkersForTlogsAcrossDa } else { if (it.second.interf.locality.dataHallId().present()) - TraceEvent(SevWarn,"GWFTADNotAvailable", id) + TraceEvent(SevWarn,"GWFTADNotAvailable", functionId) .detail("Fitness", fitness) .detailext("Zone", it.second.interf.locality.zoneId()) .detailext("DataHall", it.second.interf.locality.dataHallId()) @@ -243,7 +244,8 @@ std::vector> getWorkersForTlogsAcrossDa .detail("Locality", it.second.interf.locality.toString()) .detail("tLogReplicationFactor", conf.tLogReplicationFactor) .detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]") - .detail("DesiredLogs", conf.getDesiredLogs()); + .detail("DesiredLogs", conf.getDesiredLogs()) + .detail("InterfaceId", id); unavailableLocals.push_back(it.second.interf.locality); } } @@ -258,12 +260,13 @@ std::vector> getWorkersForTlogsAcrossDa logServerMap->add(worker.first.locality, &worker); } if (logServerSet->size() < conf.tLogReplicationFactor) { - TraceEvent(SevWarn,"GWFTADTooFew", id) + TraceEvent(SevWarn,"GWFTADTooFew", functionId) .detail("Fitness", fitness) .detail("Processes", logServerSet->size()) .detail("tLogReplicationFactor", conf.tLogReplicationFactor) .detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]") - .detail("DesiredLogs", conf.getDesiredLogs()); + .detail("DesiredLogs", conf.getDesiredLogs()) + .detail("InterfaceId", id); } else if (logServerSet->size() <= conf.getDesiredLogs()) { ASSERT(conf.tLogPolicy); @@ -275,12 +278,13 @@ std::vector> getWorkersForTlogsAcrossDa break; } else { - TraceEvent(SevWarn,"GWFTADNotAcceptable", id) + TraceEvent(SevWarn,"GWFTADNotAcceptable", functionId) .detail("Fitness", fitness) .detail("Processes", logServerSet->size()) .detail("tLogReplicationFactor", conf.tLogReplicationFactor) .detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]") - .detail("DesiredLogs", conf.getDesiredLogs()); + .detail("DesiredLogs", conf.getDesiredLogs()) + .detail("InterfaceId", id); } } // Try to select the desired size, if larger @@ -300,7 +304,7 @@ std::vector> getWorkersForTlogsAcrossDa results.push_back(*object); tLocalities.push_back(object->first.locality); } - TraceEvent("GWFTADBestResults", id) + TraceEvent("GWFTADBestResults", functionId) .detail("Fitness", fitness) .detail("Processes", logServerSet->size()) .detail("BestCount", bestSet.size()) @@ -308,17 +312,19 @@ std::vector> getWorkersForTlogsAcrossDa .detail("BestDataHalls", ::describeDataHalls(tLocalities)) .detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]") .detail("TotalResults", results.size()) - .detail("DesiredLogs", conf.getDesiredLogs()); + .detail("DesiredLogs", conf.getDesiredLogs()) + .detail("InterfaceId", id); bCompleted = true; break; } else { - TraceEvent(SevWarn,"GWFTADNoBest", id) + TraceEvent(SevWarn,"GWFTADNoBest", functionId) .detail("Fitness", fitness) .detail("Processes", logServerSet->size()) .detail("tLogReplicationFactor", conf.tLogReplicationFactor) .detail("tLogPolicy", conf.tLogPolicy ? 
conf.tLogPolicy->info() : "[unset]") - .detail("DesiredLogs", conf.getDesiredLogs()); + .detail("DesiredLogs", conf.getDesiredLogs()) + .detail("InterfaceId", id); } } } @@ -331,7 +337,7 @@ std::vector> getWorkersForTlogsAcrossDa tLocalities.push_back(object->first.locality); } - TraceEvent(SevWarn, "GetTLogTeamFailed") + TraceEvent(SevWarn, "GetTLogTeamFailed", functionId) .detail("Policy", conf.tLogPolicy->info()) .detail("Processes", logServerSet->size()) .detail("Workers", id_worker.size()) @@ -344,7 +350,8 @@ std::vector> getWorkersForTlogsAcrossDa .detail("DesiredLogs", conf.getDesiredLogs()) .detail("RatingTests",SERVER_KNOBS->POLICY_RATING_TESTS) .detail("checkStable", checkStable) - .detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS).backtrace(); + .detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS) + .detail("InterfaceId", id).backtrace(); // Free the set logServerSet->clear(); @@ -356,14 +363,25 @@ std::vector> getWorkersForTlogsAcrossDa id_used[result.first.locality.processId()]++; } - TraceEvent("GetTLogTeamDone") + TraceEvent("GetTLogTeamDone", functionId) .detail("Completed", bCompleted).detail("Policy", conf.tLogPolicy->info()) .detail("Results", results.size()).detail("Processes", logServerSet->size()) .detail("Workers", id_worker.size()) .detail("Replication", conf.tLogReplicationFactor) .detail("Desired", conf.getDesiredLogs()) .detail("RatingTests",SERVER_KNOBS->POLICY_RATING_TESTS) - .detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS); + .detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS) + .detail("InterfaceId", id); + + for (auto& result : results) { + TraceEvent("GetTLogTeamWorker", functionId) + .detail("Class", result.second.toString()) + .detail("Address", result.first.address()) + .detailext("Zone", result.first.locality.zoneId()) + .detailext("DataHall", result.first.locality.dataHallId()) + .detail("isExcludedServer", conf.isExcludedServer(result.first.address())) + .detail("isAvailable", IFailureMonitor::failureMonitor().getState(result.first.storage.getEndpoint()).isAvailable()); + } // Free the set logServerSet->clear(); diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 2e872a68d7..72929e7ceb 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -204,8 +204,8 @@ ACTOR Future simulatedFDBDRebooter( loop { auto waitTime = SERVER_KNOBS->MIN_REBOOT_TIME + (SERVER_KNOBS->MAX_REBOOT_TIME - SERVER_KNOBS->MIN_REBOOT_TIME) * g_random->random01(); cycles ++; - TraceEvent("SimulatedFDBDWait").detail("Cycles", cycles).detail("RandomId", randomId) - .detail("ProcessAddress", NetworkAddress(ip, port, true, false)) + TraceEvent("SimulatedFDBDPreWait").detail("Cycles", cycles).detail("RandomId", randomId) + .detail("Address", NetworkAddress(ip, port, true, false)) .detailext("ZoneId", localities.zoneId()) .detail("waitTime", waitTime).detail("Port", port); @@ -219,10 +219,10 @@ ACTOR Future simulatedFDBDRebooter( TraceEvent("SimulatedRebooterStarting", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId) .detailext("ZoneId", localities.zoneId()) .detailext("DataHall", localities.dataHallId()) - .detail("ProcessAddress", process->address.toString()) - .detail("ProcessExcluded", process->excluded) + .detail("Address", process->address.toString()) + .detail("Excluded", process->excluded) .detail("UsingSSL", useSSL); - TraceEvent("ProgramStart").detail("Cycles", cycles) + TraceEvent("ProgramStart").detail("Cycles", 
cycles).detail("RandomId", randomId) .detail("SourceVersion", getHGVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) @@ -248,7 +248,7 @@ ACTOR Future simulatedFDBDRebooter( } catch (Error& e) { // If in simulation, if we make it here with an error other than io_timeout but enASIOTimedOut is set then somewhere an io_timeout was converted to a different error. if(g_network->isSimulated() && e.code() != error_code_io_timeout && (bool)g_network->global(INetwork::enASIOTimedOut)) - TraceEvent(SevError, "IOTimeoutErrorSuppressed").detail("ErrorCode", e.code()).backtrace(); + TraceEvent(SevError, "IOTimeoutErrorSuppressed").detail("ErrorCode", e.code()).detail("RandomId", randomId).backtrace(); if (onShutdown.isReady() && onShutdown.isError()) throw onShutdown.getError(); if(e.code() != error_code_actor_cancelled) @@ -258,15 +258,15 @@ ACTOR Future simulatedFDBDRebooter( } TraceEvent("SimulatedFDBDDone", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId) - .detail("ProcessAddress", process->address) - .detail("ProcessExcluded", process->excluded) + .detail("Address", process->address) + .detail("Excluded", process->excluded) .detailext("ZoneId", localities.zoneId()) .detail("KillType", onShutdown.isReady() ? onShutdown.get() : ISimulator::None); if (!onShutdown.isReady()) onShutdown = ISimulator::InjectFaults; } catch (Error& e) { - TraceEvent(destructed ? SevInfo : SevError, "SimulatedFDBDRebooterError", localities.zoneId()).error(e, true); + TraceEvent(destructed ? SevInfo : SevError, "SimulatedFDBDRebooterError", localities.zoneId()).detail("RandomId", randomId).error(e, true); onShutdown = e; } @@ -276,6 +276,11 @@ ACTOR Future simulatedFDBDRebooter( process->rebooting = true; process->shutdownSignal.send(ISimulator::None); } + TraceEvent("SimulatedFDBDWait", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId) + .detail("Address", process->address) + .detail("Excluded", process->excluded) + .detail("Rebooting", process->rebooting) + .detailext("ZoneId", localities.zoneId()); Void _ = wait( g_simulator.onProcess( simProcess ) ); Void _ = wait(delay(0.00001 + FLOW_KNOBS->MAX_BUGGIFIED_DELAY)); // One last chance for the process to clean up? 
@@ -284,15 +289,15 @@ ACTOR Future simulatedFDBDRebooter( auto shutdownResult = onShutdown.get(); TraceEvent("SimulatedFDBDShutdown", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId) - .detail("ProcessAddress", process->address) - .detail("ProcessExcluded", process->excluded) + .detail("Address", process->address) + .detail("Excluded", process->excluded) .detailext("ZoneId", localities.zoneId()) .detail("KillType", shutdownResult); if( shutdownResult < ISimulator::RebootProcessAndDelete ) { TraceEvent("SimulatedFDBDLowerReboot", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId) - .detail("ProcessAddress", process->address) - .detail("ProcessExcluded", process->excluded) + .detail("Address", process->address) + .detail("Excluded", process->excluded) .detailext("ZoneId", localities.zoneId()) .detail("KillType", shutdownResult); return onShutdown.get(); @@ -300,7 +305,7 @@ ACTOR Future simulatedFDBDRebooter( if( onShutdown.get() == ISimulator::RebootProcessAndDelete ) { TraceEvent("SimulatedFDBDRebootAndDelete", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId) - .detail("ProcessAddress", process->address) + .detail("Address", process->address) .detailext("ZoneId", localities.zoneId()) .detail("KillType", shutdownResult); *coordFolder = joinPath(baseFolder, g_random->randomUniqueID().toString()); @@ -317,7 +322,7 @@ ACTOR Future simulatedFDBDRebooter( } else { TraceEvent("SimulatedFDBDJustRepeat", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId) - .detail("ProcessAddress", process->address) + .detail("Address", process->address) .detailext("ZoneId", localities.zoneId()) .detail("KillType", shutdownResult); } @@ -351,6 +356,7 @@ ACTOR Future simulatedMachine( state int bootCount = 0; state std::vector myFolders; state std::vector coordFolders; + state UID randomId = g_nondeterministic_random->randomUniqueID(); try { CSimpleIni ini; @@ -387,6 +393,7 @@ ACTOR Future simulatedMachine( std::string path = joinPath(myFolders[i], "fdb.cluster"); Reference clusterFile(useSeedFile ? 
new ClusterConnectionFile(path, connStr.toString()) : new ClusterConnectionFile(path)); processes.push_back(simulatedFDBDRebooter(clusterFile, ips[i], sslEnabled, i + 1, localities, processClass, &myFolders[i], &coordFolders[i], baseFolder, connStr, useSeedFile, runBackupAgents)); + TraceEvent("SimulatedMachineProcess", randomId).detail("Address", NetworkAddress(ips[i], i+1, true, false)).detailext("ZoneId", localities.zoneId()).detailext("DataHall", localities.dataHallId()).detail("Folder", myFolders[i]); } TEST( bootCount >= 1 ); // Simulated machine rebooted @@ -394,7 +401,7 @@ ACTOR Future simulatedMachine( TEST( bootCount >= 3 ); // Simulated machine rebooted three times ++bootCount; - TraceEvent("SimulatedMachineStart") + TraceEvent("SimulatedMachineStart", randomId) .detail("Folder0", myFolders[0]) .detail("CFolder0", coordFolders[0]) .detail("MachineIPs", toIPVectorString(ips)) @@ -410,7 +417,7 @@ ACTOR Future simulatedMachine( Void _ = wait( waitForAll( processes ) ); - TraceEvent("SimulatedMachineRebootStart") + TraceEvent("SimulatedMachineRebootStart", randomId) .detail("Folder0", myFolders[0]) .detail("CFolder0", coordFolders[0]) .detail("MachineIPs", toIPVectorString(ips)) @@ -447,7 +454,7 @@ ACTOR Future simulatedMachine( closingStr += it + ", "; } - TraceEvent("SimulatedMachineRebootAfterKills") + TraceEvent("SimulatedMachineRebootAfterKills", randomId) .detail("Folder0", myFolders[0]) .detail("CFolder0", coordFolders[0]) .detail("MachineIPs", toIPVectorString(ips)) @@ -476,12 +483,12 @@ ACTOR Future simulatedMachine( openFiles += *it + ", "; i++; } - TraceEvent("MachineFilesOpen").detail("PAddr", toIPVectorString(ips)).detail("OpenFiles", openFiles); + TraceEvent("MachineFilesOpen", randomId).detail("PAddr", toIPVectorString(ips)).detail("OpenFiles", openFiles); } else break; if( shutdownDelayCount++ >= 50 ) { // Worker doesn't shut down instantly on reboot - TraceEvent(SevError, "SimulatedFDBDFilesCheck") + TraceEvent(SevError, "SimulatedFDBDFilesCheck", randomId) .detail("PAddrs", toIPVectorString(ips)) .detailext("ZoneId", localities.zoneId()) .detailext("DataHall", localities.dataHallId()); @@ -492,8 +499,8 @@ ACTOR Future simulatedMachine( backoff = std::min( backoff + 1.0, 6.0 ); } - TraceEvent("SimulatedFDBDFilesClosed") - .detail("ProcessAddress", toIPVectorString(ips)) + TraceEvent("SimulatedFDBDFilesClosed", randomId) + .detail("Address", toIPVectorString(ips)) .detailext("ZoneId", localities.zoneId()) .detailext("DataHall", localities.dataHallId()); @@ -515,7 +522,7 @@ ACTOR Future simulatedMachine( auto rebootTime = g_random->random01() * MACHINE_REBOOT_TIME; - TraceEvent("SimulatedMachineShutdown") + TraceEvent("SimulatedMachineShutdown", randomId) .detail("Swap", swap) .detail("KillType", killType) .detail("RebootTime", rebootTime) @@ -535,7 +542,7 @@ ACTOR Future simulatedMachine( if( myFolders != toRebootFrom ) { TEST( true ); // Simulated machine swapped data folders - TraceEvent("SimulatedMachineFolderSwap") + TraceEvent("SimulatedMachineFolderSwap", randomId) .detail("OldFolder0", myFolders[0]).detail("NewFolder0", toRebootFrom[0]) .detail("MachineIPs", toIPVectorString(ips)); } diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 8606d89258..05d45c8166 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -184,7 +184,7 @@ std::string filenameFromSample( KeyValueStoreType storeType, std::string folder, if( storeType == KeyValueStoreType::SSD_BTREE_V1 ) return joinPath( folder, sample_filename ); else if ( 
storeType == KeyValueStoreType::SSD_BTREE_V2 ) - return joinPath(folder, sample_filename); + return joinPath(folder, sample_filename); else if( storeType == KeyValueStoreType::MEMORY ) return joinPath( folder, sample_filename.substr(0, sample_filename.size() - 5) ); @@ -195,7 +195,7 @@ std::string filenameFromId( KeyValueStoreType storeType, std::string folder, std if( storeType == KeyValueStoreType::SSD_BTREE_V1) return joinPath( folder, prefix + id.toString() + ".fdb" ); else if (storeType == KeyValueStoreType::SSD_BTREE_V2) - return joinPath(folder, prefix + id.toString() + ".sqlite"); + return joinPath(folder, prefix + id.toString() + ".sqlite"); else if( storeType == KeyValueStoreType::MEMORY ) return joinPath( folder, prefix + id.toString() + "-" ); @@ -355,6 +355,7 @@ void startRole(UID roleId, UID workerId, std::string as, std::mapisSimulated()) g_simulator.addRole(g_network->getLocalAddress(), as); } void endRole(UID id, std::string as, std::string reason, bool ok, Error e) { @@ -386,6 +387,7 @@ void endRole(UID id, std::string as, std::string reason, bool ok, Error e) { g_roles.erase({as, id.shortString()}); StringMetricHandle(LiteralStringRef("Roles")) = roleString(g_roles, false); StringMetricHandle(LiteralStringRef("RolesWithIDs")) = roleString(g_roles, true); + if (g_network->isSimulated()) g_simulator.removeRole(g_network->getLocalAddress(), as); } ACTOR Future monitorServerDBInfo( Reference>> ccInterface, Reference connFile, LocalityData locality, Reference> dbInfo ) { @@ -621,7 +623,7 @@ ACTOR Future workerServer( Reference connFile, Refe Reference checkFile = wait( IAsyncFileSystem::filesystem()->open( joinPath(folder, validationFilename), IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_READWRITE, 0600 ) ); Void _ = wait( checkFile->sync() ); } - + if(g_network->isSimulated()) { TraceEvent("SimulatedReboot").detail("Deletion", rebootReq.deleteData ); if( rebootReq.deleteData ) { @@ -660,7 +662,7 @@ ACTOR Future workerServer( Reference connFile, Refe std::map details; details["ForMaster"] = req.recruitmentID.shortString(); details["StorageEngine"] = req.storeType.toString(); - + //FIXME: start role for every tlog instance, rather than just for the shared actor, also use a different role type for the shared actor startRole( logId, interf.id(), "SharedTLog", details ); diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 41eb49a49d..5a9f198f1b 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -155,6 +155,7 @@ struct MachineAttritionWorkload : TestWorkload { LocalityData targetMachine = self->machines.back(); TraceEvent("Assassination").detail("TargetMachine", targetMachine.toString()) + .detailext("zoneId", targetMachine.zoneId()) .detail("Reboot", self->reboot).detail("killedMachines", killedMachines) .detail("machinesToKill", self->machinesToKill).detail("machinesToLeave", self->machinesToLeave) .detail("machines", self->machines.size()).detail("Replace", self->replacement); @@ -166,8 +167,9 @@ struct MachineAttritionWorkload : TestWorkload { g_simulator.killMachine( targetMachine.zoneId(), ISimulator::Reboot ); } } else { - TraceEvent("WorkerKill").detail("MachineCount", self->machines.size()); - if( g_random->random01() < 0.33 ) { + auto randomDouble = g_random->random01(); + TraceEvent("WorkerKill").detail("MachineCount", self->machines.size()).detail("RandomValue", randomDouble); + if (randomDouble < 0.33 ) {
TraceEvent("RebootAndDelete").detail("TargetMachine", targetMachine.toString()); g_simulator.killMachine( targetMachine.zoneId(), ISimulator::RebootAndDelete ); } else { diff --git a/fdbserver/workloads/RemoveServersSafely.actor.cpp b/fdbserver/workloads/RemoveServersSafely.actor.cpp index 2df77ca5e9..6d9e2e131c 100644 --- a/fdbserver/workloads/RemoveServersSafely.actor.cpp +++ b/fdbserver/workloads/RemoveServersSafely.actor.cpp @@ -26,9 +26,6 @@ #include "fdbrpc/simulator.h" #include "fdbclient/ManagementAPI.h" -const char* removeClearEnv = getenv("REMOVE_CLEAR"); -int removeClear = removeClearEnv ? atoi(removeClearEnv) : 1; - template <> std::string describe( uint32_t const& item ) { return format("%d", item); @@ -154,6 +151,7 @@ struct RemoveServersSafelyWorkload : TestWorkload { { std::vector processes; std::set processAddrs; + UID functionId = g_nondeterministic_random->randomUniqueID(); // Get the list of process network addresses for (auto& netAddr : netAddrs) { @@ -170,24 +168,64 @@ struct RemoveServersSafelyWorkload : TestWorkload { // Get the list of processes matching network address for (auto processInfo : g_simulator.getAllProcesses()) { auto processNet = AddressExclusion(processInfo->address.ip, processInfo->address.port); - if (processAddrs.find(processNet) != processAddrs.end()) + if (processAddrs.find(processNet) != processAddrs.end()) { processes.push_back(processInfo); + TraceEvent("RemoveAndKill", functionId).detail("Step", "getProcessItem").detail("ProcessAddress", processInfo->address).detail("Process", describe(*processInfo)).detail("failed", processInfo->failed).detail("excluded", processInfo->excluded).detail("rebooting", processInfo->rebooting).detail("Protected", g_simulator.protectedAddresses.count(processInfo->address)); + } + else { + TraceEvent("RemoveAndKill", functionId).detail("Step", "getProcessNoItem").detail("ProcessAddress", processInfo->address).detail("Process", describe(*processInfo)).detail("failed", processInfo->failed).detail("excluded", processInfo->excluded).detail("rebooting", processInfo->rebooting).detail("Protected", g_simulator.protectedAddresses.count(processInfo->address)); + } } - TraceEvent("RemoveAndKill").detail("Step", "getProcesses") + TraceEvent("RemoveAndKill", functionId).detail("Step", "getProcesses") + .detail("netAddrSize",netAddrs.size()).detail("processAddrSize",processAddrs.size()) .detail("netAddrs",describe(netAddrs)).detail("processAddrs",describe(processAddrs)) .detail("Proceses", processes.size()).detail("MachineProcesses", machineProcesses.size()); - // Processes may have been destroyed causing -// ASSERT(processAddrs.size() == processes.size()); return processes; } + virtual std::vector excludeAddresses(std::set const& procAddrs) + { + // Get the updated list of processes which may have changed due to reboots, deletes, etc + std::vector procArray = getProcesses(procAddrs); + + // Include all of the excluded machines because the first command of the next section is includeall + TraceEvent("RemoveAndKill").detail("Step", "exclude addresses").detail("AddrTotal", procAddrs.size()).detail("ProcTotal", procArray.size()).detail("Addresses", describe(procAddrs)).detail("ClusterAvailable", g_simulator.isAvailable()); + for (auto& procAddr : procAddrs) { + g_simulator.excludeAddress(NetworkAddress(procAddr.ip, procAddr.port, true, false)); + } + for (auto& procRecord : procArray) { + procRecord->excluded = true; + TraceEvent("RemoveAndKill").detail("Step", "ExcludeAddress").detail("ProcessAddress", 
procRecord->address).detail("Process", describe(*procRecord)).detail("failed", procRecord->failed).detail("rebooting", procRecord->rebooting).detail("ClusterAvailable", g_simulator.isAvailable()); + } + return procArray; + } + + virtual std::vector includeAddresses(std::set const& procAddrs) + { + // Get the updated list of processes which may have changed due to reboots, deletes, etc + std::vector procArray = getProcesses(procAddrs); + + // Include all of the excluded machines because the first command of the next section is includeall + TraceEvent("RemoveAndKill").detail("Step", "include addresses").detail("AddrTotal", procAddrs.size()).detail("ProcTotal", procArray.size()).detail("Addresses", describe(procAddrs)).detail("ClusterAvailable", g_simulator.isAvailable()); + for (auto& procAddr : procAddrs) { + g_simulator.includeAddress(NetworkAddress(procAddr.ip, procAddr.port, true, false)); + } + for (auto& procRecord : procArray) { + // Only change the exclusion member, if not failed since it will require a reboot to revive it + if (!procRecord->failed) + procRecord->excluded = false; + TraceEvent("RemoveAndKill").detail("Step", "IncludeAddress").detail("ProcessAddress", procRecord->address).detail("Process", describe(*procRecord)).detail("failed", procRecord->failed).detail("rebooting", procRecord->rebooting).detail("ClusterAvailable", g_simulator.isAvailable()); + } + return procArray; + } + virtual std::vector protectServers(std::set const& killAddrs) { std::vector processes; std::set processAddrs; std::vector killableAddrs; - std::vector killProcesses, killableProcesses, processesLeft, processesDead; + std::vector killProcArray, killableProcesses, processesLeft, processesDead; // Get the list of processes matching network address for (auto processInfo : getServers()) { @@ -199,7 +237,7 @@ struct RemoveServersSafelyWorkload : TestWorkload { else if (killAddrs.find(processNet) == killAddrs.end()) processesLeft.push_back(processInfo); else - killProcesses.push_back(processInfo); + killProcArray.push_back(processInfo); } // Identify the largest set of processes which can be killed @@ -207,22 +245,22 @@ struct RemoveServersSafelyWorkload : TestWorkload { bool bCanKillProcess; ISimulator::ProcessInfo* randomProcess; auto deadProcess = processesDead.back(); - for (int killsLeft = killProcesses.size(); killsLeft > 0; killsLeft --) + for (int killsLeft = killProcArray.size(); killsLeft > 0; killsLeft --) { // Select a random kill process randomIndex = g_random->randomInt(0, killsLeft); - randomProcess = killProcesses[randomIndex]; + randomProcess = killProcArray[randomIndex]; processesDead.push_back(randomProcess); - killProcesses[randomIndex] = killProcesses.back(); - killProcesses.pop_back(); + killProcArray[randomIndex] = killProcArray.back(); + killProcArray.pop_back(); // Add all of the remaining processes the leftover array - processesLeft.insert(processesLeft.end(), killProcesses.begin(), killProcesses.end()); + processesLeft.insert(processesLeft.end(), killProcArray.begin(), killProcArray.end()); // Check if we can kill the added process bCanKillProcess = g_simulator.canKillProcesses(processesLeft, processesDead, ISimulator::KillInstantly, NULL); // Remove the added processes - processesLeft.resize(processesLeft.size() - killProcesses.size()); + processesLeft.resize(processesLeft.size() - killProcArray.size()); if (bCanKillProcess) { killableProcesses.push_back(randomProcess); @@ -247,94 +285,133 @@ struct RemoveServersSafelyWorkload : TestWorkload { // Removing the first set of 
machines might legitimately bring the database down, so a timeout is not an error state std::vector firstCoordinators; - state std::vector killProcesses; + state std::vector killProcArray; + state bool bClearedFirst; - TraceEvent("RemoveAndKill").detail("Step", "exclude first list").detail("toKill1", describe(toKill1)).detail("KillTotal", toKill1.size()) - .detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill").detail("Step", "exclude list first").detail("toKill", describe(toKill1)).detail("KillTotal", toKill1.size()).detail("ClusterAvailable", g_simulator.isAvailable()); + self->excludeAddresses(toKill1); - killProcesses = self->getProcesses(toKill1); - TraceEvent("RemoveAndKill").detail("Step", "mark first processes excluded").detail("Addresses", describe(toKill1)) - .detail("AddressTotal", toKill1.size()).detail("Processes", killProcesses.size()) - .detail("ClusterAvailable", g_simulator.isAvailable()); - for (auto& killProcess : killProcesses) { - killProcess->excluded = true; - g_simulator.excludeAddress(killProcess->address); - TraceEvent("RemoveAndKill").detail("Step", "MarkProcessFirst").detail("Process", describe(*killProcess)); - } + Optional result = wait( timeout( removeAndKill( self, cx, toKill1, NULL), self->kill1Timeout ) ); - Optional result = wait( timeout( removeAndKill( self, cx, toKill1), self->kill1Timeout ) ); + bClearedFirst = result.present(); - TraceEvent("RemoveAndKill").detail("Step", "first exclusion result").detail("result", result.present() ? "succeeded" : "failed"); - killProcesses = self->getProcesses(toKill1); - TraceEvent("RemoveAndKill").detail("Step", "include first processes").detail("toKill1", describe(toKill1)) - .detail("KillTotal", toKill1.size()).detail("Processes", killProcesses.size()); - for (auto& killProcess : killProcesses) { - g_simulator.includeAddress(killProcess->address); - killProcess->excluded = false; + TraceEvent("RemoveAndKill").detail("Step", "excluded list first").detail("excluderesult", bClearedFirst ? 
"succeeded" : "failed").detail("KillTotal", toKill1.size()).detail("Processes", killProcArray.size()).detail("toKill1", describe(toKill1)).detail("ClusterAvailable", g_simulator.isAvailable()); + + bClearedFirst=false; + // Include the servers, if unable to exclude + if (!bClearedFirst) { + // Get the updated list of processes which may have changed due to reboots, deletes, etc + TraceEvent("RemoveAndKill").detail("Step", "include all first").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill1)).detail("ClusterAvailable", g_simulator.isAvailable()); + Void _ = wait( includeServers( cx, vector(1) ) ); + self->includeAddresses(toKill1); + TraceEvent("RemoveAndKill").detail("Step", "included all first").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill1)).detail("ClusterAvailable", g_simulator.isAvailable()); } - killProcesses = self->protectServers(toKill2); + // Get the list of protected servers + killProcArray = self->protectServers(toKill2); // Update the kill networks to the killable processes - toKill2 = self->getNetworks(killProcesses); + toKill2 = self->getNetworks(killProcArray); - TraceEvent("RemoveAndKill").detail("Step", "Mark second processes excluded").detail("toKill2", describe(toKill2)) - .detail("KillTotal", toKill2.size()).detail("Processes", killProcesses.size()); - for (auto& killProcess : killProcesses) { - killProcess->excluded = true; - g_simulator.excludeAddress(killProcess->address); - TraceEvent("RemoveAndKill").detail("Step", "MarkProcessSecond").detail("Processes", killProcesses.size()).detail("Process", describe(*killProcess)); - } + TraceEvent("RemoveAndKill").detail("Step", "exclude list second").detail("KillTotal", toKill2.size()).detail("toKill", describe(toKill2)).detail("ClusterAvailable", g_simulator.isAvailable()); + self->excludeAddresses(toKill2); // The second set of machines is selected so that we can always make progress without it, even after the permitted number of other permanent failures // so we expect to succeed after a finite amount of time state Future disabler = disableConnectionFailuresAfter( self->kill2Timeout/2, "RemoveServersSafely" ); TraceEvent("RemoveAndKill").detail("Step", "exclude second list").detail("toKill2", describe(toKill2)).detail("KillTotal", toKill2.size()) - .detail("Processes", killProcesses.size()).detail("ClusterAvailable", g_simulator.isAvailable()); - Void _ = wait( reportErrors( timeoutError( removeAndKill( self, cx, toKill2), self->kill2Timeout ), "RemoveServersSafelyError", UID() ) ); + .detail("Processes", killProcArray.size()).detail("ClusterAvailable", g_simulator.isAvailable()); + Void _ = wait( reportErrors( timeoutError( removeAndKill( self, cx, toKill2, bClearedFirst ? 
&toKill1 : NULL), self->kill2Timeout ), "RemoveServersSafelyError", UID() ) ); - - TraceEvent("RemoveAndKill").detail("Step", "excluded second list").detail("KillTotal", toKill2.size()).detail("Excluded", killProcesses.size()) - .detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill").detail("Step", "excluded second list").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill2)).detail("ClusterAvailable", g_simulator.isAvailable()); // Reinclude all of the machine, if buggified if (BUGGIFY) { - TraceEvent("RemoveAndKill").detail("Step", "final include all").detail("ClusterAvailable", g_simulator.isAvailable()); + // Get the updated list of processes which may have changed due to reboots, deletes, etc + TraceEvent("RemoveAndKill").detail("Step", "include all second").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill2)).detail("ClusterAvailable", g_simulator.isAvailable()); Void _ = wait( includeServers( cx, vector(1) ) ); - for (auto& killProcess : killProcesses) { - g_simulator.includeAddress(killProcess->address); - killProcess->excluded = false; - } - TraceEvent("RemoveAndKill").detail("Step", "final included all").detail("ClusterAvailable", g_simulator.isAvailable()); + self->includeAddresses(toKill2); + TraceEvent("RemoveAndKill").detail("Step", "included all second").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill2)).detail("ClusterAvailable", g_simulator.isAvailable()); } return Void(); } - ACTOR static Future removeAndKill( RemoveServersSafelyWorkload* self, Database cx, std::set toKill) + virtual std::vector killAddresses(std::set const& killAddrs) { - // First clear the exclusion list and exclude the given list - TraceEvent("RemoveAndKill").detail("Step", "include all").detail("ClusterAvailable", g_simulator.isAvailable()); - Void _ = wait( includeServers( cx, vector(1) ) ); - TraceEvent("RemoveAndKill").detail("Step", "included all").detail("ClusterAvailable", g_simulator.isAvailable()); + UID functionId = g_nondeterministic_random->randomUniqueID(); + bool removeViaClear = !BUGGIFY; + std::vector killProcArray; + std::vector toKillArray; - state std::vector killProcesses; + std::copy(killAddrs.begin(), killAddrs.end(), std::back_inserter(toKillArray)); + killProcArray = getProcesses(killAddrs); + + // Reboot and delete or kill the servers + if( killProcesses ) { + TraceEvent("RemoveAndKill", functionId).detail("Step", removeViaClear ? 
"ClearProcesses" : "IgnoreProcesses").detail("Addresses", describe(killAddrs)) + .detail("Processes", killProcArray.size()).detail("ClusterAvailable", g_simulator.isAvailable()).detail("RemoveViaClear", removeViaClear); + for (auto& killProcess : killProcArray) { + if (g_simulator.protectedAddresses.count(killProcess->address)) + TraceEvent("RemoveAndKill", functionId).detail("Step", "NoKill Process").detail("Process", describe(*killProcess)).detail("failed", killProcess->failed).detail("rebooting", killProcess->rebooting).detail("ClusterAvailable", g_simulator.isAvailable()).detail("Protected", g_simulator.protectedAddresses.count(killProcess->address)); + else if (removeViaClear) { + g_simulator.rebootProcess( killProcess, ISimulator::RebootProcessAndDelete); + TraceEvent("RemoveAndKill", functionId).detail("Step", "Clear Process").detail("Process", describe(*killProcess)).detail("failed", killProcess->failed).detail("rebooting", killProcess->rebooting).detail("ClusterAvailable", g_simulator.isAvailable()).detail("Protected", g_simulator.protectedAddresses.count(killProcess->address)); + } +/* + else { + g_simulator.killProcess( killProcess, ISimulator::KillInstantly ); + TraceEvent("RemoveAndKill", functionId).detail("Step", "Kill Process").detail("Process", describe(*killProcess)).detail("failed", killProcess->failed).detail("rebooting", killProcess->rebooting).detail("ClusterAvailable", g_simulator.isAvailable()).detail("Protected", g_simulator.protectedAddresses.count(killProcess->address)); + } +*/ + } + } + else { + std::set>> zoneIds; + bool killedMachine; + for (auto& killProcess : killProcArray) { + zoneIds.insert(killProcess->locality.zoneId()); + } + TraceEvent("RemoveAndKill", functionId).detail("Step", removeViaClear ? "ClearMachines" : "KillMachines").detail("Addresses", describe(killAddrs)).detail("Processes", killProcArray.size()).detail("Zones", zoneIds.size()).detail("ClusterAvailable", g_simulator.isAvailable()); + for (auto& zoneId : zoneIds) { + killedMachine = g_simulator.killMachine( zoneId, removeViaClear ? ISimulator::RebootAndDelete : ISimulator::KillInstantly, removeViaClear); + TraceEvent(killedMachine ? SevInfo : SevWarn, "RemoveAndKill").detail("Step", removeViaClear ? "Clear Machine" : "Kill Machine").detailext("ZoneId", zoneId).detail(removeViaClear ? 
"Cleared" : "Killed", killedMachine).detail("ClusterAvailable", g_simulator.isAvailable()); + } + } + + return killProcArray; + } + + ACTOR static Future removeAndKill( RemoveServersSafelyWorkload* self, Database cx, std::set toKill, std::set* pIncAddrs) + { + state UID functionId = g_nondeterministic_random->randomUniqueID(); + + // First clear the exclusion list and exclude the given list + TraceEvent("RemoveAndKill", functionId).detail("Step", "include all").detail("ClusterAvailable", g_simulator.isAvailable()); + Void _ = wait( includeServers( cx, vector(1) ) ); + TraceEvent("RemoveAndKill", functionId).detail("Step", "included all").detail("ClusterAvailable", g_simulator.isAvailable()); + // Reinclude the addresses that were excluded, if present + if (pIncAddrs) { + self->includeAddresses(*pIncAddrs); + } + + state std::vector killProcArray; state std::vector toKillArray; std::copy(toKill.begin(), toKill.end(), std::back_inserter(toKillArray)); - killProcesses = self->getProcesses(toKill); + killProcArray = self->getProcesses(toKill); - TraceEvent("RemoveAndKill").detail("Step", "Activate Server Exclusion").detail("toKill", describe(toKill)).detail("Addresses", describe(toKillArray)).detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill", functionId).detail("Step", "Activate Server Exclusion").detail("KillAddrs", toKill.size()).detail("KillProcs", killProcArray.size()).detail("MissingProcs", toKill.size()!=killProcArray.size()).detail("toKill", describe(toKill)).detail("Addresses", describe(toKillArray)).detail("ClusterAvailable", g_simulator.isAvailable()); Void _ = wait( excludeServers( cx, toKillArray ) ); // We need to skip at least the quorum change if there's nothing to kill, because there might not be enough servers left // alive to do a coordinators auto (?) if (toKill.size()) { // Wait for removal to be safe - TraceEvent("RemoveAndKill").detail("Step", "Wait For Server Exclusion").detail("Addresses", describe(toKill)).detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill", functionId).detail("Step", "Wait For Server Exclusion").detail("Addresses", describe(toKill)).detail("ClusterAvailable", g_simulator.isAvailable()); Void _ = wait( waitForExcludedServers( cx, toKillArray ) ); - TraceEvent("RemoveAndKill").detail("Step", "coordinators auto").detail("desiredCoordinators", g_simulator.desiredCoordinators).detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill", functionId).detail("Step", "coordinators auto").detail("desiredCoordinators", g_simulator.desiredCoordinators).detail("ClusterAvailable", g_simulator.isAvailable()); // Setup the coordinators BEFORE the exclusion // Otherwise, we may end up with NotEnoughMachinesForCoordinators @@ -349,38 +426,14 @@ struct RemoveServersSafelyWorkload : TestWorkload { break; } - // Reboot and delete or kill the servers - if( self->killProcesses ) { - TraceEvent("RemoveAndKill").detail("Step", removeClear ? "ClearProcesses" : "KillProcesses").detail("Addresses", describe(toKill)) - .detail("Processes", killProcesses.size()).detail("ClusterAvailable", g_simulator.isAvailable()); - for (auto& killProcess : killProcesses) { - TraceEvent("RemoveAndKill").detail("Step", removeClear ? 
"Clear Process" : "Kill Process").detail("Process", describe(*killProcess)).detail("ClusterAvailable", g_simulator.isAvailable()).detail("Protected", g_simulator.protectedAddresses.count(killProcess->address)); -// ASSERT(g_simulator.protectedAddresses.count(killProcess->address) == 0); - if (removeClear) - g_simulator.rebootProcess( killProcess, ISimulator::RebootProcessAndDelete); - else - g_simulator.killProcess( killProcess, ISimulator::KillInstantly ); - } - } - else { - std::set>> zoneIds; - bool killedMachine; - for (auto& killProcess : killProcesses) { - zoneIds.insert(killProcess->locality.zoneId()); - } - TraceEvent("RemoveAndKill").detail("Step", removeClear ? "ClearMachines" : "KillMachines").detail("Addresses", describe(toKill)).detail("Processes", killProcesses.size()).detail("Zones", zoneIds.size()).detail("ClusterAvailable", g_simulator.isAvailable()); - for (auto& zoneId : zoneIds) { - killedMachine = g_simulator.killMachine( zoneId, removeClear ? ISimulator::RebootAndDelete : ISimulator::KillInstantly, removeClear ? true : false ); - TraceEvent(killedMachine ? SevInfo : SevWarn, "RemoveAndKill").detail("Step", removeClear ? "Clear Machine" : "Kill Machine").detailext("ZoneId", zoneId).detail(removeClear ? "Cleared" : "Killed", killedMachine).detail("ClusterAvailable", g_simulator.isAvailable()); - } - } + self->killAddresses(toKill); } else { - TraceEvent("RemoveAndKill").detail("Step", "nothing to clear").detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill", functionId).detail("Step", "nothing to clear").detail("ClusterAvailable", g_simulator.isAvailable()); } - TraceEvent("RemoveAndKill").detail("Step", "done").detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill", functionId).detail("Step", "done").detail("ClusterAvailable", g_simulator.isAvailable()); return Void(); } From 6b2f1abbf8c9b514b7a0ad92a29c2143bc41ecaa Mon Sep 17 00:00:00 2001 From: Yichi Chiang Date: Mon, 28 Aug 2017 16:02:31 -0700 Subject: [PATCH 12/30] release 5.0.4 preparation --- packaging/msi/FDBInstaller.wxs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index 16f98314d1..4b364a242e 100755 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@ Date: Mon, 28 Aug 2017 17:16:46 -0700 Subject: [PATCH 13/30] fix: Set lock aware at the transaction level for latency probe to avoid having to fill the shard cache every time. 
--- fdbserver/Status.actor.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 4345f5ec77..f3a11cfd1a 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -947,6 +947,7 @@ ACTOR static Future doGrvProbe(Transaction *tr, OptionalsetOption(FDBTransactionOptions::LOCK_AWARE); if(priority.present()) { tr->setOption(priority.get()); } @@ -969,6 +970,7 @@ ACTOR static Future doReadProbe(Future grvProbe, Transaction *tr state double start = timer_monotonic(); loop { + tr->setOption(FDBTransactionOptions::LOCK_AWARE); try { Optional > _ = wait(tr->get(LiteralStringRef("\xff/StatusJsonTestKey62793"))); return timer_monotonic() - start; @@ -993,6 +995,7 @@ ACTOR static Future doCommitProbe(Future grvProbe, Transaction * loop { try { + tr->setOption(FDBTransactionOptions::LOCK_AWARE); tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr->makeSelfConflicting(); Void _ = wait(tr->commit()); @@ -1022,9 +1025,7 @@ ACTOR static Future doProbe(Future probe, int timeoutSeconds, cons return Void(); } -ACTOR static Future latencyProbeFetcher(Reference> db, StatusArray *messages, std::set *incomplete_reasons) { - Database cx = openDBOnServer(db, TaskDefaultEndpoint, true, true); // Open a new database connection that is lock-aware - +ACTOR static Future latencyProbeFetcher(Database cx, StatusArray *messages, std::set *incomplete_reasons) { state Transaction trImmediate(cx); state Transaction trDefault(cx); state Transaction trBatch(cx); @@ -1777,9 +1778,7 @@ ACTOR Future clusterGetStatus( if (configuration.present()){ // Do the latency probe by itself to avoid interference from other status activities - state Future latencyProbe = latencyProbeFetcher(db, &messages, &status_incomplete_reasons); - - StatusObject latencyProbeResults = wait(latencyProbe); + StatusObject latencyProbeResults = wait(latencyProbeFetcher(cx, &messages, &status_incomplete_reasons)); statusObj["database_available"] = latencyProbeResults.count("immediate_priority_transaction_start_seconds") && latencyProbeResults.count("read_seconds") && latencyProbeResults.count("commit_seconds"); if (!latencyProbeResults.empty()) { From 512f02bb8e0ff485957939dbc38c19f57e4b40d6 Mon Sep 17 00:00:00 2001 From: Yichi Chiang Date: Tue, 29 Aug 2017 10:12:00 -0700 Subject: [PATCH 14/30] Update GUID for release-5.0.4 --- packaging/msi/FDBInstaller.wxs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index 4b364a242e..ce6c16edd8 100755 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@ Date: Tue, 29 Aug 2017 11:00:44 -0700 Subject: [PATCH 15/30] Add trace event for rebooting process during simulation for consistency check --- fdbserver/workloads/ConsistencyCheck.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 9f04f566e8..2fa1141c51 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -1087,6 +1087,7 @@ struct ConsistencyCheckWorkload : TestWorkload if(!statefulProcesses[itr->first.address()].count(id)) { TraceEvent("ConsistencyCheck_ExtraDataStore").detail("Address", itr->first.address()).detail("DataStoreID", id); if(g_network->isSimulated() && !g_simulator.speedUpSimulation) { + 
TraceEvent("ConsistencyCheck_RebootProcess").detail("Address", itr->first.address()).detail("DataStoreID", id); g_simulator.rebootProcess(g_simulator.getProcessByAddress(itr->first.address()), ISimulator::RebootProcess); } From 6020d708637418750075f845274162626620b292 Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Tue, 29 Aug 2017 11:41:27 -0700 Subject: [PATCH 16/30] Added trace event to track reboots initiated by ConsistencyCheck workload in simulation --- fdbserver/workloads/ConsistencyCheck.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index fae6f05c0c..68e3103af7 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -1099,6 +1099,7 @@ struct ConsistencyCheckWorkload : TestWorkload if(!statefulProcesses[itr->first.address()].count(id)) { TraceEvent("ConsistencyCheck_ExtraDataStore").detail("Address", itr->first.address()).detail("DataStoreID", id); if(g_network->isSimulated() && !g_simulator.speedUpSimulation) { + TraceEvent("ConsistencyCheck_RebootProcess").detail("Address", itr->first.address()).detail("DataStoreID", id); g_simulator.rebootProcess(g_simulator.getProcessByAddress(itr->first.address()), ISimulator::RebootProcess); } From 6eb980db2fd93917dbd26c4ae488c52965754732 Mon Sep 17 00:00:00 2001 From: Yichi Chiang Date: Tue, 29 Aug 2017 14:09:23 -0700 Subject: [PATCH 17/30] Update version.target to 5.0.5 as next patch version --- versions.target | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions.target b/versions.target index b5f2321bcb..e8cbee8e07 100644 --- a/versions.target +++ b/versions.target @@ -1,7 +1,7 @@ - 5.0.4 + 5.0.5 5.0 From d9f2c858a0c194bccd3e08d2b3e8c4b27b391507 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 30 Aug 2017 08:34:43 -0700 Subject: [PATCH 18/30] Rollback version for docs change --- versions.target | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions.target b/versions.target index e8cbee8e07..b5f2321bcb 100644 --- a/versions.target +++ b/versions.target @@ -1,7 +1,7 @@ - 5.0.5 + 5.0.4 5.0 From 45962378c53a823463f024905531cadf24b61507 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 30 Aug 2017 12:06:45 -0700 Subject: [PATCH 19/30] Restore version to 5.0.5 --- versions.target | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions.target b/versions.target index b5f2321bcb..e8cbee8e07 100644 --- a/versions.target +++ b/versions.target @@ -1,7 +1,7 @@ - 5.0.4 + 5.0.5 5.0 From 963e1c3f31dc6e5b1c8e1a5ea26632ddff164acb Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 30 Aug 2017 12:58:46 -0700 Subject: [PATCH 20/30] fix: we need to reboot the process even if it will result in too many files, because the check will not succeed without it --- fdbserver/workloads/ConsistencyCheck.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 2fa1141c51..53b6af4da3 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -1086,7 +1086,7 @@ struct ConsistencyCheckWorkload : TestWorkload for(auto id : stores.get()) { if(!statefulProcesses[itr->first.address()].count(id)) { TraceEvent("ConsistencyCheck_ExtraDataStore").detail("Address", itr->first.address()).detail("DataStoreID", id); - if(g_network->isSimulated() && !g_simulator.speedUpSimulation) { + if(g_network->isSimulated()) { TraceEvent("ConsistencyCheck_RebootProcess").detail("Address", itr->first.address()).detail("DataStoreID", id); g_simulator.rebootProcess(g_simulator.getProcessByAddress(itr->first.address()), ISimulator::RebootProcess); } From 6e9de8f35a209c2d7ec5aef0003daec7cbdf9915 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 31 Aug 2017 00:11:18 -0700 Subject: [PATCH 21/30] Bug fix. eraseDirectoryRecursive() on MacOS used to do nothing at all, but now it erases directories recursively. The Linux version was modified to be simpler and use a version of the FTW API that also works on MacOS. --- fdbrpc/Platform.cpp | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/fdbrpc/Platform.cpp b/fdbrpc/Platform.cpp index d04781bd5f..1ca8597f85 100644 --- a/fdbrpc/Platform.cpp +++ b/fdbrpc/Platform.cpp @@ -89,15 +89,11 @@ void eraseDirectoryRecursive( std::string const& dir ) { INJECT_FAULT( platform_error, "eraseDirectoryRecursive" ); #ifdef _WIN32 system( ("rd /s /q \"" + dir + "\"").c_str() ); -#elif defined(__linux__) +#elif defined(__linux__) || defined(__APPLE__) int error = nftw(dir.c_str(), - [](const char *fpath, const struct stat *sb, int typeflag, - struct FTW *ftwbuf) -> int { - if (remove(fpath)) - return FTW_STOP; - return FTW_CONTINUE; - }, 64, FTW_DEPTH | FTW_PHYS | FTW_ACTIONRETVAL); + [](const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) -> int { return remove(fpath); } + , 64, FTW_DEPTH | FTW_PHYS); /* Looks like calling code expects this to continue silently if the directory we're deleting doesn't exist in the first place */ @@ -105,14 +101,6 @@ void eraseDirectoryRecursive( std::string const& dir ) { TraceEvent(SevError, "nftw").detail("Directory", dir).GetLastError(); throw platform_error(); } -#elif defined(__APPLE__) - // const char* argv[2]; - // argv[0] = dir.c_str(); - // argv[1] = NULL; - // FTS* fts = fts_open(argv, FTS_PHYSICAL | FTS_SEEDOT | FTS_NOSTAT, NULL); - // while (FTSENT* ent = fts_read(fts)) { - // if (ent->fts_info - // } #else #error Port me! #endif From cc24072a5d2ac2445f4e285a2f73b51dd6381807 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Thu, 31 Aug 2017 16:23:55 -0700 Subject: [PATCH 22/30] Add the multi version API to the list of APIs to choose in the APICorrectness tester. Support for the multi-version client already existed. --- fdbserver/workloads/ApiCorrectness.actor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbserver/workloads/ApiCorrectness.actor.cpp b/fdbserver/workloads/ApiCorrectness.actor.cpp index 9912e99c0d..a97660d753 100644 --- a/fdbserver/workloads/ApiCorrectness.actor.cpp +++ b/fdbserver/workloads/ApiCorrectness.actor.cpp @@ -132,11 +132,12 @@ public: } ACTOR Future performSetup(Database cx, ApiCorrectnessWorkload *self) { - //Choose a random transaction type (NativeAPI, ReadYourWrites, ThreadSafe) + //Choose a random transaction type (NativeAPI, ReadYourWrites, ThreadSafe, MultiVersion) std::vector types; types.push_back(NATIVE); types.push_back(READ_YOUR_WRITES); types.push_back(THREAD_SAFE); + types.push_back(MULTI_VERSION); Void _ = wait(self->chooseTransactionFactory(cx, types)); From f19deec5d76b44f33a56e75da298a1d8106eb3d1 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Fri, 1 Sep 2017 09:15:59 -0700 Subject: [PATCH 23/30] Remove a few more outdated items --- .../cie/foundationdb/test/AsListTest.java | 88 ------ .../apple/cie/foundationdb/test/OSTest.java | 38 --- .../apple/cie/foundationdb/test/TestApp.java | 91 ------ layers/directory/directory.py | 260 ------------------ layers/directory/dirtest2.py | 79 ------ layers/directory/subspace.py | 58 ---- 6 files changed, 614 deletions(-) delete mode 100644 bindings/java/src-completable/test/com/apple/cie/foundationdb/test/AsListTest.java delete mode 100644 bindings/java/src-completable/test/com/apple/cie/foundationdb/test/OSTest.java delete mode 100644 bindings/java/src-completable/test/com/apple/cie/foundationdb/test/TestApp.java delete mode 100644 layers/directory/directory.py delete mode 100644 layers/directory/dirtest2.py delete mode 100644 layers/directory/subspace.py diff --git a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/AsListTest.java b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/AsListTest.java deleted file mode 100644 index a0c28147fb..0000000000 --- a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/AsListTest.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * AsListTest.java - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.apple.cie.foundationdb.test; - -import com.apple.cie.foundationdb.Database; -import com.apple.cie.foundationdb.FDB; -import com.apple.cie.foundationdb.LocalityUtil; -import com.apple.cie.foundationdb.Transaction; -import com.apple.cie.foundationdb.async.AsyncUtil; - -import java.util.function.Function; -import java.util.concurrent.CompletableFuture; - -public class AsListTest { - /** - * When the database contains keys a, b, c, d, e -- this should return 5 items, - * a bug made the addition of the clear into the result returning 0 items. - */ - public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(500); - Database database = fdb.open("T:\\circus\\tags\\RebarCluster-bbc\\cluster_id.txt"); - database.options().setLocationCacheSize(42); - Transaction tr = database.createTransaction(); - //tr.clear("g".getBytes()); - /*tr.clear("bbb".getBytes()); - AsyncIterable query = tr.getRange( - KeySelector.firstGreaterOrEqual("a".getBytes()), - KeySelector.firstGreaterOrEqual("e".getBytes()), - Integer.MAX_VALUE); - //List list = query.asList().get(); - //System.out.println("List size: " + list.size()); -*/ - String[] keyAddresses = LocalityUtil.getAddressesForKey(tr, "a".getBytes()).join(); - for(String s : keyAddresses) { - System.out.println(" @ " + s); - } - - @SuppressWarnings("unused") - CompletableFuture i = AsyncUtil.applySafely(new Function>() { - @Override - public CompletableFuture apply(Exception o) { - return CompletableFuture.completedFuture(3); - } - }, new RuntimeException()); - - CompletableFuture f = null; - - @SuppressWarnings({ "unused", "null" }) - CompletableFuture g = f.thenComposeAsync(new Function>() { - @Override - public CompletableFuture apply(Integer o) { - return CompletableFuture.completedFuture(o.toString()); - } - }); - - @SuppressWarnings({ "unused", "null" }) - CompletableFuture g2 = f.thenComposeAsync(new Function>() { - @Override - public CompletableFuture apply(Integer o) { - return CompletableFuture.completedFuture(o.toString()); - } - }).exceptionally(new Function() { - @Override - public String apply(Throwable o) { - // TODO Auto-generated method stub - return null; - } - }); - } -} diff --git a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/OSTest.java b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/OSTest.java deleted file mode 100644 index 36577119dc..0000000000 --- a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/OSTest.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * OSTest.java - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.apple.cie.foundationdb.test; - -import java.io.InputStream; - -public class OSTest { - - /** - * @param args - */ - public static void main(String[] args) { - System.out.println("OS name: " + System.getProperty("os.name")); - System.out.println("OS arch: " + System.getProperty("os.arch")); - - InputStream stream = OSTest.class.getResourceAsStream("/lib/linux/amd64/libfdb_java.so"); - System.out.println("Stream: " + stream); - } - -} diff --git a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/TestApp.java b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/TestApp.java deleted file mode 100644 index 0d077fab27..0000000000 --- a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/TestApp.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * TestApp.java - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.apple.cie.foundationdb.test; - -import java.util.concurrent.CompletableFuture; - -import com.apple.cie.foundationdb.Cluster; -import com.apple.cie.foundationdb.Database; -import com.apple.cie.foundationdb.FDB; -import com.apple.cie.foundationdb.Transaction; - -public class TestApp { - - public static void main(String[] args) throws Exception { - try { - Cluster cluster = FDB.selectAPIVersion(500).createCluster(); - System.out.println("I now have the cluster"); - Database db = cluster.openDatabase(); - - Transaction tr = db.createTransaction(); - System.out.println("TR: " + tr); - - byte[] appleValue = tr.get("apple".getBytes()).get(); - System.out.println("Apple: " + (appleValue == null ? 
null : new String(appleValue))); - - tr.set("apple".getBytes(), "crunchy".getBytes()); - System.out.println("Attempting to commit apple/crunchy..."); - tr.commit().get(); // FIXME: this is not an ok use of the API - tr = db.createTransaction(); - - long topTime = 0, getTime = 0, bottomTime = 0; - - for(int i = 0; i < 1000; i++) { - long a = System.currentTimeMillis(); - - final byte[] key = ("apple" + i).getBytes(); - tr = db.createTransaction(); - CompletableFuture future = tr.get(key); - - long b = System.currentTimeMillis(); - - future.get(); - - long c = System.currentTimeMillis(); - - tr.set(key, ("Apple" + i).getBytes()); - final CompletableFuture commit = tr.commit(); - - long d = System.currentTimeMillis(); - - commit.whenCompleteAsync((v, error) -> { - if(error != null) { - error.printStackTrace(); - } - }); - - topTime += b - a; - getTime += c - b; - bottomTime += d - c; - } - - System.out.println(" Top: " + topTime); - System.out.println(" Get: " + getTime); - System.out.println(" Bottom: " + bottomTime); - - tr.dispose(); - db.dispose(); - cluster.dispose(); - } catch(Throwable t) { - t.printStackTrace(); - } - } -} diff --git a/layers/directory/directory.py b/layers/directory/directory.py deleted file mode 100644 index 996c3530cc..0000000000 --- a/layers/directory/directory.py +++ /dev/null @@ -1,260 +0,0 @@ -# -# directory.py -# -# This source file is part of the FoundationDB open source project -# -# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
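An aside on PATCH 23/30: the deleted TestApp.java above commits without any retry handling and flags tr.commit().get() with a FIXME in the source itself. For contrast, a minimal flow-style sketch of the retry loop the C++ client of this era expects, assuming the fdbclient headers shown elsewhere in this series (setKey and its arguments are illustrative, not from the patch):

    #include "fdbclient/NativeAPI.h"

    ACTOR Future<Void> setKey(Database cx, Key key, Value value) {
        state Transaction tr(cx);  // one Transaction object, reset between attempts
        loop {
            try {
                tr.set(key, value);
                Void _ = wait(tr.commit());
                return Void();
            } catch (Error& e) {
                // onError() rethrows non-retryable errors; otherwise it backs
                // off and resets tr so the loop can try again.
                Void _ = wait(tr.onError(e));
            }
        }
    }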
-# - -from subspace import Subspace -import fdb, fdb.tuple -import random, struct - -fdb.api_version(100) - -#TODO: Error class - -class HighContentionAllocator (object): - def __init__(self, subspace): - self.counters = subspace[0] - self.recent = subspace[1] - - @fdb.transactional - def allocate( self, tr ): - """Returns a byte string which - (1) has never and will never be returned by another call to HighContentionAllocator.allocate() on the same subspace - (2) is nearly as short as possible given the above""" - - [(start, count)] = [ (self.counters.unpack(k)[0],struct.unpack("= window: - # Advance the window - del tr[ self.counters : self.counters[start].key()+chr(0) ] - start += window - del tr[ self.recent : self.recent[start] ] - window = self._window_size(start) - - # Increment the allocation count for the current window - tr.add( self.counters[start], struct.pack(" Date: Fri, 1 Sep 2017 09:34:53 -0700 Subject: [PATCH 24/30] Fixed OS X compilation build warnings due to printf field type specifiers --- bindings/flow/fdb_flow.actor.cpp | 2 +- fdbrpc/FlowTests.actor.cpp | 18 +++++++++--------- fdbrpc/dsltest.actor.cpp | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bindings/flow/fdb_flow.actor.cpp b/bindings/flow/fdb_flow.actor.cpp index 1d4ac67365..816c13fb72 100644 --- a/bindings/flow/fdb_flow.actor.cpp +++ b/bindings/flow/fdb_flow.actor.cpp @@ -41,7 +41,7 @@ ACTOR Future _test() { // tr->setVersion(1); Version ver = wait( tr->getReadVersion() ); - printf("%ld\n", ver); + printf("%lld\n", ver); state std::vector< Future > versions; diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index 08a15e09eb..093bc756aa 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -344,10 +344,10 @@ TEST_CASE("flow/flow/quorum") vector> fs; vector> qs; for (auto& p : ps) fs.push_back(p.getFuture()); - + for (int i = 0; i <= ps.size(); i++) qs.push_back( quorum(fs, i) ); - + for (int i = 0; i < ps.size(); i++) { ASSERT(qs[i].isReady()); ASSERT(!qs[i + 1].isReady()); @@ -357,7 +357,7 @@ TEST_CASE("flow/flow/quorum") return Void(); } -TEST_CASE("flow/flow/trivial futures") +TEST_CASE("flow/flow/trivial futures") { Future invalid; ASSERT(!invalid.isValid()); @@ -499,7 +499,7 @@ TEST_CASE("flow/flow/promisestream callbacks") onReady(p.getFuture(), [&result](int x) { result = x; }, [&result](Error e){ result = -1; }); ASSERT(result == 0); - + p = PromiseStream(); ASSERT(result == -1); @@ -989,7 +989,7 @@ TEST_CASE("flow/flow/perf/actor patterns") ASSERT(out2[i].isReady()); } printf("2xcheeseActor(chooseTwoActor(cheeseActor(fifo), never)): %0.2f M/sec\n", N / 1e6 / (timer() - start)); - printf("sizeof(CheeseWaitActorActor) == %d\n", sizeof(CheeseWaitActorActor)); + printf("sizeof(CheeseWaitActorActor) == %lu\n", sizeof(CheeseWaitActorActor)); } { @@ -1140,11 +1140,11 @@ TEST_CASE("flow/flow/YieldedAsyncMap/cancel2") state Future y2 = yam.onChange(2); auto* pyam = &yam; - uncancellable(trigger( + uncancellable(trigger( [pyam](){ printf("Triggered\n"); - pyam->triggerAll(); - }, + pyam->triggerAll(); + }, delay(1))); Void _ = wait(y1); @@ -1191,4 +1191,4 @@ TEST_CASE("fdbrpc/flow/wait_expression_after_cancel") f.cancel(); ASSERT( a == 1 ); return Void(); -} \ No newline at end of file +} diff --git a/fdbrpc/dsltest.actor.cpp b/fdbrpc/dsltest.actor.cpp index bdfa2d0ee1..5275dc078c 100644 --- a/fdbrpc/dsltest.actor.cpp +++ b/fdbrpc/dsltest.actor.cpp @@ -79,7 +79,7 @@ bool testFuzzActor( Future(*actor)(FutureStream const&, PromiseStream< } } if 
(outCount+1 != expectedOutput.size()) { - printf("\tERROR: %s output length incorrect: %d != expected %d\n", desc, outCount+1, expectedOutput.size()); + printf("\tERROR: %s output length incorrect: %d != expected %lu\n", desc, outCount+1, expectedOutput.size()); if (trial) printf("\t\tResult was inconsistent between runs!\n"); ok = false; //return false; From 0994587573917419c00cfc46a7badf07f197f3c7 Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Fri, 1 Sep 2017 09:35:56 -0700 Subject: [PATCH 25/30] Fixed OS X compilation build warnings due to printf field specifiers --- bindings/flow/fdb_flow.actor.cpp | 2 +- fdbrpc/FlowTests.actor.cpp | 18 +++++++++--------- fdbrpc/dsltest.actor.cpp | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bindings/flow/fdb_flow.actor.cpp b/bindings/flow/fdb_flow.actor.cpp index 1d4ac67365..816c13fb72 100644 --- a/bindings/flow/fdb_flow.actor.cpp +++ b/bindings/flow/fdb_flow.actor.cpp @@ -41,7 +41,7 @@ ACTOR Future _test() { // tr->setVersion(1); Version ver = wait( tr->getReadVersion() ); - printf("%ld\n", ver); + printf("%lld\n", ver); state std::vector< Future > versions; diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index 08a15e09eb..093bc756aa 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -344,10 +344,10 @@ TEST_CASE("flow/flow/quorum") vector> fs; vector> qs; for (auto& p : ps) fs.push_back(p.getFuture()); - + for (int i = 0; i <= ps.size(); i++) qs.push_back( quorum(fs, i) ); - + for (int i = 0; i < ps.size(); i++) { ASSERT(qs[i].isReady()); ASSERT(!qs[i + 1].isReady()); @@ -357,7 +357,7 @@ TEST_CASE("flow/flow/quorum") return Void(); } -TEST_CASE("flow/flow/trivial futures") +TEST_CASE("flow/flow/trivial futures") { Future invalid; ASSERT(!invalid.isValid()); @@ -499,7 +499,7 @@ TEST_CASE("flow/flow/promisestream callbacks") onReady(p.getFuture(), [&result](int x) { result = x; }, [&result](Error e){ result = -1; }); ASSERT(result == 0); - + p = PromiseStream(); ASSERT(result == -1); @@ -989,7 +989,7 @@ TEST_CASE("flow/flow/perf/actor patterns") ASSERT(out2[i].isReady()); } printf("2xcheeseActor(chooseTwoActor(cheeseActor(fifo), never)): %0.2f M/sec\n", N / 1e6 / (timer() - start)); - printf("sizeof(CheeseWaitActorActor) == %d\n", sizeof(CheeseWaitActorActor)); + printf("sizeof(CheeseWaitActorActor) == %lu\n", sizeof(CheeseWaitActorActor)); } { @@ -1140,11 +1140,11 @@ TEST_CASE("flow/flow/YieldedAsyncMap/cancel2") state Future y2 = yam.onChange(2); auto* pyam = &yam; - uncancellable(trigger( + uncancellable(trigger( [pyam](){ printf("Triggered\n"); - pyam->triggerAll(); - }, + pyam->triggerAll(); + }, delay(1))); Void _ = wait(y1); @@ -1191,4 +1191,4 @@ TEST_CASE("fdbrpc/flow/wait_expression_after_cancel") f.cancel(); ASSERT( a == 1 ); return Void(); -} \ No newline at end of file +} diff --git a/fdbrpc/dsltest.actor.cpp b/fdbrpc/dsltest.actor.cpp index bdfa2d0ee1..5275dc078c 100644 --- a/fdbrpc/dsltest.actor.cpp +++ b/fdbrpc/dsltest.actor.cpp @@ -79,7 +79,7 @@ bool testFuzzActor( Future(*actor)(FutureStream const&, PromiseStream< } } if (outCount+1 != expectedOutput.size()) { - printf("\tERROR: %s output length incorrect: %d != expected %d\n", desc, outCount+1, expectedOutput.size()); + printf("\tERROR: %s output length incorrect: %d != expected %lu\n", desc, outCount+1, expectedOutput.size()); if (trial) printf("\t\tResult was inconsistent between runs!\n"); ok = false; //return false; From fe9abbfac97835ff7529ec47824685c72e816c2b Mon Sep 17 00:00:00 2001 From: Alec Grieser 
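An aside on PATCHES 24-25/30: the two patches above carry the same diff content (the fix appears to have landed twice). The underlying issue: Version is int64_t, which is `long` on LP64 Linux but `long long` on macOS, and sizeof()/std::vector::size() produce size_t, whose printf conversion also varies by platform. The patches pick %lld and %lu to quiet the macOS warnings; a fully portable alternative, offered as an aside rather than as what the patches do, is <cinttypes> plus %zu:

    #include <cinttypes>  // PRId64 expands to the right conversion per platform
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        int64_t version = 12345;             // e.g. an FDB Version
        std::vector<int> expectedOutput(7);
        printf("%" PRId64 "\n", version);                // no %ld-vs-%lld guesswork
        printf("size == %zu\n", expectedOutput.size());  // size_t always takes %zu
        return 0;
    }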
Date: Fri, 1 Sep 2017 09:54:03 -0700 Subject: [PATCH 26/30] revert 'Remove unused code' for function referenced in fdbrpc --- flow/Net2.actor.cpp | 119 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 282363844a..ddbfb7ae08 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -982,3 +982,122 @@ void startThreadF( F && func ) { Thing* t = new Thing(std::move(func)); startThread(Thing::start, t); } + +void net2_test() { + /*printf("ThreadSafeQueue test\n"); + printf(" Interface: "); + ThreadSafeQueue tq; + ASSERT( tq.canSleep() == true ); + + ASSERT( tq.push( 1 ) == true ) ; + ASSERT( tq.push( 2 ) == false ); + ASSERT( tq.push( 3 ) == false ); + + ASSERT( tq.pop().get() == 1 ); + ASSERT( tq.pop().get() == 2 ); + ASSERT( tq.push( 4 ) == false ); + ASSERT( tq.pop().get() == 3 ); + ASSERT( tq.pop().get() == 4 ); + ASSERT( !tq.pop().present() ); + printf("OK\n"); + + printf("Threaded: "); + Event finished, finished2; + int thread1Iterations = 1000000, thread2Iterations = 100000; + + if (thread1Iterations) + startThreadF([&](){ + printf("Thread1\n"); + for(int i=0; i i = tq.pop(); + if (i.present()) { + int v = i.get(); + ++c; + if (mx[v>>20] != v) + printf("Wrong value dequeued!\n"); + ASSERT( mx[v>>20] == v ); + mx[v>>20] = v + 1; + } else { + ++p; + _mm_pause(); + } + if ((c&3)==0) tq.canSleep(); + } + printf("%d %d %x %x %s\n", c, p, mx[0], mx[1], mx[0]==thread1Iterations && mx[1]==(1<<20)+thread2Iterations ? "OK" : "FAIL"); + + finished.block(); + finished2.block(); + + + g_network = newNet2(NetworkAddress::parse("127.0.0.1:12345")); // for promise serialization below + + Endpoint destination; + + printf(" Used: %lld\n", FastAllocator<4096>::getMemoryUsed()); + + char junk[100]; + + double before = timer(); + + vector reqs; + reqs.reserve( 10000 ); + + int totalBytes = 0; + for(int j=0; j<1000; j++) { + UnsentPacketQueue unsent; + ReliablePacketList reliable; + + reqs.resize(10000); + for(int i=0; i<10000; i++) { + TestGVR &req = reqs[i]; + req.key = LiteralStringRef("Foobar"); + + SerializeSource what(req); + + SendBuffer* pb = unsent.getWriteBuffer(); + ReliablePacket* rp = new ReliablePacket; // 0 + + PacketWriter wr(pb,rp,AssumeVersion(currentProtocolVersion)); + //BinaryWriter wr; + SplitBuffer packetLen; + uint32_t len = 0; + wr.writeAhead(sizeof(len), &packetLen); + wr << destination.token; + //req.reply.getEndpoint(); + what.serializePacketWriter(wr); + //wr.serializeBytes(junk, 43); + + unsent.setWriteBuffer(wr.finish()); + len = wr.size() - sizeof(len); + packetLen.write(&len, sizeof(len)); + + //totalBytes += wr.getLength(); + totalBytes += wr.size(); + + if (rp) reliable.insert(rp); + } + reqs.clear(); + unsent.discardAll(); + reliable.discardAll(); + } + + printf("SimSend x 1Kx10K: %0.2f sec\n", timer()-before); + printf(" Bytes: %d\n", totalBytes); + printf(" Used: %lld\n", FastAllocator<4096>::getMemoryUsed()); + */ +}; From 6f6dbe4b331d82863ca8e282705ed595c994948a Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 1 Sep 2017 11:14:18 -0700 Subject: [PATCH 27/30] fix: load balance will still use second requests when client locality is present --- fdbrpc/LoadBalance.actor.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 555ef11919..a61419ae73 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -204,6 +204,22 @@ Future< REPLY_TYPE(Request) > loadBalance( } } } + if( 
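// (aside) nextMetric appears to start as a large sentinel, so a value still
// above 1e8 here means the earlier candidate pass -- per the commit message,
// restricted to the countBest() client-local alternatives -- found no usable
// second-request target; the branch guarded by this condition widens the scan
// to the remaining alternatives so a backup request can still be issued: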
nextMetric > 1e8 ) { + for(int i=alternatives->countBest(); isize(); i++) { + RequestStream const* thisStream = &alternatives->get( i, channel ); + if (!IFailureMonitor::failureMonitor().getState( thisStream->getEndpoint() ).failed) { + auto& qd = model->getMeasurement(thisStream->getEndpoint().token.first()); + double thisMetric = qd.smoothOutstanding.smoothTotal(); + double thisTime = qd.latency; + + if( thisMetric < nextMetric ) { + nextAlt = i; + nextMetric = thisMetric; + nextTime = thisTime; + } + } + } + } if(nextTime < 1e9) { if(bestTime > FLOW_KNOBS->INSTANT_SECOND_REQUEST_MULTIPLIER*(model->secondMultiplier*(nextTime) + FLOW_KNOBS->BASE_SECOND_REQUEST_TIME)) { From 560e172c46f7558aea53ecc2d4dfaa7e2be2d634 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 1 Sep 2017 12:36:24 -0700 Subject: [PATCH 28/30] tests: Update binding single key get range performance test to specify a limit of 2 and use exact mode. --- bindings/c/test/performance_test.c | 6 +++--- .../com/apple/cie/foundationdb/test/PerformanceTester.java | 2 +- tests/python_tests/python_performance.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bindings/c/test/performance_test.c b/bindings/c/test/performance_test.c index aef1926f77..5af01b0678 100644 --- a/bindings/c/test/performance_test.c +++ b/bindings/c/test/performance_test.c @@ -500,8 +500,8 @@ struct RunResult getSingleKeyRange(struct ResultSet *rs, FDBTransaction *tr) { FDBFuture *f = fdb_transaction_get_range(tr, keys[key], keySize, 1, 0, keys[key + 1], keySize, 1, 0, - 0, 0, - FDB_STREAMING_MODE_WANT_ALL, 1, 0, 0); + 2, 0, + FDB_STREAMING_MODE_EXACT, 1, 0, 0); e = maybeLogError(fdb_future_block_until_ready(f), "waiting for single key range", rs); if(e) { @@ -516,7 +516,7 @@ struct RunResult getSingleKeyRange(struct ResultSet *rs, FDBTransaction *tr) { } if(outCount != 1) { - logError(4100, "non-1 number of keys returned in single key range read", rs); + logError(4100, "more than one key returned in single key range read", rs); fdb_future_destroy(f); return RES(0, 4100); } diff --git a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/PerformanceTester.java b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/PerformanceTester.java index 562f6831c9..97ae836ae1 100644 --- a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/PerformanceTester.java +++ b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/PerformanceTester.java @@ -352,7 +352,7 @@ public class PerformanceTester extends AbstractTester { long start = System.nanoTime(); for (int i = 0; i < count; i++) { int keyIndex = randomKeyIndex(); - tr.getRange(key(keyIndex), key(keyIndex + 1)).asList().join(); + tr.getRange(key(keyIndex), key(keyIndex + 1), 2).asList().join(); } long end = System.nanoTime(); diff --git a/tests/python_tests/python_performance.py b/tests/python_tests/python_performance.py index 8595f5b816..0f4115c6e7 100755 --- a/tests/python_tests/python_performance.py +++ b/tests/python_tests/python_performance.py @@ -275,7 +275,7 @@ class PythonPerformance(PythonTest): for i in range(count): index = random.randint(0, self.key_count) - list(tr[self.key(index):self.key(index+1)]) + list(tr.get_range(self.key(index), self.key(index+1), limit=2)) return count / (time.time() - s) From dc1f7ca6b77a5e0cfcd5cb349fd66f28c2f3a313 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 1 Sep 2017 12:53:01 -0700 Subject: [PATCH 29/30] testers now use client locality load balancing --- fdbclient/NativeAPI.actor.cpp 
| 4 ++-- fdbclient/NativeAPI.h | 2 +- fdbserver/TesterInterface.h | 4 ++-- fdbserver/fdbserver.actor.cpp | 25 +++++++++++++------------ fdbserver/tester.actor.cpp | 24 ++++++++++++------------ fdbserver/worker.actor.cpp | 4 ++-- 6 files changed, 32 insertions(+), 31 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 2f2fefee6d..5e23b34b16 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -754,8 +754,8 @@ Reference Cluster::createCluster(std::string connFileName, int apiVersi return Reference(new Cluster( rccf, apiVersion)); } -Future Cluster::createDatabase( Standalone dbName ) { - return DatabaseContext::createDatabase( clusterInterface, Reference::addRef( this ), dbName, LocalityData() ); +Future Cluster::createDatabase( Standalone dbName, LocalityData locality ) { + return DatabaseContext::createDatabase( clusterInterface, Reference::addRef( this ), dbName, locality ); } Future Cluster::onConnected() { diff --git a/fdbclient/NativeAPI.h b/fdbclient/NativeAPI.h index 6f5de002cf..fe33950553 100644 --- a/fdbclient/NativeAPI.h +++ b/fdbclient/NativeAPI.h @@ -110,7 +110,7 @@ public: static Reference createCluster(std::string connFileName, int apiVersion); // See DatabaseContext::createDatabase - Future createDatabase( Standalone dbName ); + Future createDatabase( Standalone dbName, LocalityData locality = LocalityData() ); void setOption(FDBClusterOptions::Option option, Optional value); diff --git a/fdbserver/TesterInterface.h b/fdbserver/TesterInterface.h index 09241e0a87..ca4d99b8e6 100644 --- a/fdbserver/TesterInterface.h +++ b/fdbserver/TesterInterface.h @@ -83,11 +83,11 @@ struct TesterInterface { } }; -Future testerServerCore( TesterInterface const& interf, Reference const& ccf, Reference> const& ); +Future testerServerCore( TesterInterface const& interf, Reference const& ccf, Reference> const&, LocalityData const& ); enum test_location_t { TEST_HERE, TEST_ON_SERVERS, TEST_ON_TESTERS }; enum test_type_t { TEST_TYPE_FROM_FILE, TEST_TYPE_CONSISTENCY_CHECK }; -Future runTests( Reference const& connFile, test_type_t const& whatToRun, test_location_t const& whereToRun, int const& minTestersExpected, std::string const& fileName = std::string(), StringRef const& startingConfiguration = StringRef() ); +Future runTests( Reference const& connFile, test_type_t const& whatToRun, test_location_t const& whereToRun, int const& minTestersExpected, std::string const& fileName = std::string(), StringRef const& startingConfiguration = StringRef(), LocalityData const& locality = LocalityData() ); #endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 32bb6d4cad..fcd2d96722 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1528,6 +1528,17 @@ int main(int argc, char* argv[]) { Future> f; + Standalone machineId(getSharedMemoryMachineId().toString()); + + if (!localities.isPresent(LocalityData::keyZoneId)) + localities.set(LocalityData::keyZoneId, zoneId.present() ? 
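// (aside) machineId above is derived from getSharedMemoryMachineId(), so every
// process on the same host should compute the same value; when no explicit
// zone id was configured, that machine id doubles as the zone id: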
zoneId : machineId); + + if (!localities.isPresent(LocalityData::keyMachineId)) + localities.set(LocalityData::keyMachineId, machineId); + + if (!localities.isPresent(LocalityData::keyDcId) && dcId.present()) + localities.set(LocalityData::keyDcId, dcId); + if (role == Simulation) { TraceEvent("Simulation").detail("TestFile", testFile); @@ -1574,16 +1585,6 @@ int main(int argc, char* argv[]) { vector> actors; actors.push_back( listenError ); - Standalone machineId(getSharedMemoryMachineId().toString()); - - if (!localities.isPresent(LocalityData::keyZoneId)) - localities.set(LocalityData::keyZoneId, zoneId.present() ? zoneId : machineId); - - if (!localities.isPresent(LocalityData::keyMachineId)) - localities.set(LocalityData::keyMachineId, machineId); - - if (!localities.isPresent(LocalityData::keyDcId) && dcId.present()) - localities.set(LocalityData::keyDcId, dcId); actors.push_back( fdbd(connectionFile, localities, processClass, dataFolder, dataFolder, storageMemLimit, metricsConnFile, metricsPrefix) ); //actors.push_back( recurring( []{}, .001 ) ); // for ASIO latency measurement @@ -1591,11 +1592,11 @@ int main(int argc, char* argv[]) { f = stopAfter( waitForAll(actors) ); g_network->run(); } else if (role == MultiTester) { - f = stopAfter( runTests( connectionFile, TEST_TYPE_FROM_FILE, testOnServers ? TEST_ON_SERVERS : TEST_ON_TESTERS, minTesterCount, testFile ) ); + f = stopAfter( runTests( connectionFile, TEST_TYPE_FROM_FILE, testOnServers ? TEST_ON_SERVERS : TEST_ON_TESTERS, minTesterCount, testFile, StringRef(), localities ) ); g_network->run(); } else if (role == Test || role == ConsistencyCheck) { auto m = startSystemMonitor(dataFolder, zoneId, zoneId); - f = stopAfter( runTests( connectionFile, role == ConsistencyCheck ? TEST_TYPE_CONSISTENCY_CHECK : TEST_TYPE_FROM_FILE, TEST_HERE, 1, testFile ) ); + f = stopAfter( runTests( connectionFile, role == ConsistencyCheck ? 
TEST_TYPE_CONSISTENCY_CHECK : TEST_TYPE_FROM_FILE, TEST_HERE, 1, testFile, StringRef(), localities ) ); g_network->run(); } else if (role == CreateTemplateDatabase) { createTemplateDatabase(); diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 423dde19cd..358db2b293 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -484,7 +484,7 @@ ACTOR Future runWorkloadAsync( Database cx, WorkloadInterface workIface, T return Void(); } -ACTOR Future testerServerWorkload( WorkloadRequest work, Reference ccf, Reference> dbInfo ) { +ACTOR Future testerServerWorkload( WorkloadRequest work, Reference ccf, Reference> dbInfo, LocalityData locality ) { state WorkloadInterface workIface; state bool replied = false; state Database cx; @@ -501,7 +501,7 @@ ACTOR Future testerServerWorkload( WorkloadRequest work, Reference cluster = Cluster::createCluster(ccf->getFilename(), -1); - Database _cx = wait(cluster->createDatabase(database)); + Database _cx = wait(cluster->createDatabase(database, locality)); cx = _cx; Void _ = wait( delay(1.0) ); @@ -544,7 +544,7 @@ ACTOR Future testerServerWorkload( WorkloadRequest work, Reference testerServerCore( TesterInterface interf, Reference ccf, Reference> dbInfo ) { +ACTOR Future testerServerCore( TesterInterface interf, Reference ccf, Reference> dbInfo, LocalityData locality ) { state PromiseStream> addWorkload; state Future workerFatalError = actorCollection(addWorkload.getFuture()); @@ -552,7 +552,7 @@ ACTOR Future testerServerCore( TesterInterface interf, Reference readTests( ifstream& ifs ) { return result; } -ACTOR Future runTests( Reference>> cc, Reference>> ci, vector< TesterInterface > testers, vector tests, StringRef startingConfiguration ) { +ACTOR Future runTests( Reference>> cc, Reference>> ci, vector< TesterInterface > testers, vector tests, StringRef startingConfiguration, LocalityData locality ) { state Standalone database = LiteralStringRef("DB"); state Database cx; state Reference> dbInfo( new AsyncVar ); @@ -1016,7 +1016,7 @@ ACTOR Future runTests( Reference(), database, LocalityData() ) ); // FIXME: Locality! + Database _cx = wait( DatabaseContext::createDatabase( ci, Reference(), database, locality ) ); cx = _cx; } else database = LiteralStringRef(""); @@ -1071,7 +1071,7 @@ ACTOR Future runTests( Reference runTests( Reference>> cc, Reference>> ci, vector tests, test_location_t at, - int minTestersExpected, StringRef startingConfiguration ) { + int minTestersExpected, StringRef startingConfiguration, LocalityData locality ) { state int flags = at == TEST_ON_SERVERS ? 
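// (aside) TEST_ON_SERVERS runs workloads on the fdbserver processes themselves,
// so no worker filter is needed; otherwise the flag apparently restricts the
// GetWorkers request to tester-class processes: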
0 : GetWorkersRequest::FLAG_TESTER_CLASS; state Future testerTimeout = delay(60.0); // wait 60 sec for testers to show up state vector> workers; @@ -1097,12 +1097,12 @@ ACTOR Future runTests( Reference runTests( Reference connFile, test_type_t whatToRun, test_location_t at, - int minTestersExpected, std::string fileName, StringRef startingConfiguration ) { + int minTestersExpected, std::string fileName, StringRef startingConfiguration, LocalityData locality ) { state vector testSpecs; Reference>> cc( new AsyncVar> ); Reference>> ci( new AsyncVar> ); @@ -1147,10 +1147,10 @@ ACTOR Future runTests( Reference connFile, test_typ Reference> db( new AsyncVar ); vector iTesters(1); actors.push_back( reportErrors(monitorServerDBInfo( cc, Reference(), LocalityData(), db ), "monitorServerDBInfo") ); // FIXME: Locality - actors.push_back( reportErrors(testerServerCore( iTesters[0], connFile, db ), "testerServerCore") ); - tests = runTests( cc, ci, iTesters, testSpecs, startingConfiguration ); + actors.push_back( reportErrors(testerServerCore( iTesters[0], connFile, db, locality ), "testerServerCore") ); + tests = runTests( cc, ci, iTesters, testSpecs, startingConfiguration, locality ); } else { - tests = reportErrors(runTests(cc, ci, testSpecs, at, minTestersExpected, startingConfiguration), "runTests"); + tests = reportErrors(runTests(cc, ci, testSpecs, at, minTestersExpected, startingConfiguration, locality), "runTests"); } choose { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index ba453a2f6d..775edac8e2 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -512,7 +512,7 @@ ACTOR Future workerServer( Reference connFile, Refe if( metricsConnFile.size() > 0) { try { state Reference cluster = Cluster::createCluster( metricsConnFile, Cluster::API_VERSION_LATEST ); - metricsLogger = runMetrics( cluster->createDatabase(LiteralStringRef("DB")), KeyRef(metricsPrefix) ); + metricsLogger = runMetrics( cluster->createDatabase(LiteralStringRef("DB"), locality), KeyRef(metricsPrefix) ); } catch(Error &e) { TraceEvent(SevWarnAlways, "TDMetricsBadClusterFile").error(e).detail("ConnFile", metricsConnFile); } @@ -526,7 +526,7 @@ ACTOR Future workerServer( Reference connFile, Refe errorForwarders.add( registrationClient( ccInterface, interf, processClass ) ); errorForwarders.add( waitFailureServer( interf.waitFailure.getFuture() ) ); errorForwarders.add( monitorServerDBInfo( ccInterface, connFile, locality, dbInfo ) ); - errorForwarders.add( testerServerCore( interf.testerInterface, connFile, dbInfo ) ); + errorForwarders.add( testerServerCore( interf.testerInterface, connFile, dbInfo, locality ) ); filesClosed.add(stopping.getFuture()); From 6e26ae2bb35ac7eb0474cfb284a92317b479d270 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 1 Sep 2017 15:45:27 -0700 Subject: [PATCH 30/30] added a new multi_dc configuration --- fdbcli/fdbcli.actor.cpp | 6 +++--- fdbclient/ManagementAPI.actor.cpp | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index a0f35f5106..ca99631507 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -437,9 +437,9 @@ void initHelp() { "clear a range of keys from the database", "All keys between BEGINKEY (inclusive) and ENDKEY (exclusive) are cleared from the database. This command will succeed even if the specified range is empty, but may fail because of conflicts." 
ESCAPINGK); helpMap["configure"] = CommandHelp( - "configure [new] <single|double|triple|three_data_hall|three_datacenter|proxies=<PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*", + "configure [new] <single|double|triple|three_data_hall|three_datacenter|multi_dc|proxies=<PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*", "change database configuration", - "The `new' option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When used, both a redundancy mode and a storage engine must be specified.\n\nRedundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies of data (survive one failure).\n triple - three copies of data (survive two failures).\n three_data_hall - See the Admin Guide.\n three_datacenter - See the Admin Guide.\n\nStorage engine:\n ssd - B-Tree storage engine optimized for solid state disks.\n memory - Durable in-memory storage engine for small datasets.\n\nproxies=<PROXIES>: Sets the desired number of proxies in the cluster. Must be at least 1, or set to -1 which restores the number of proxies to the default value.\n\nlogs=<LOGS>: Sets the desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of logs to the default value.\n\nresolvers=<RESOLVERS>: Sets the desired number of resolvers in the cluster. Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the FoundationDB Administration Guide for more information."); + "The `new' option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When used, both a redundancy mode and a storage engine must be specified.\n\nRedundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies of data (survive one failure).\n triple - three copies of data (survive two failures).\n three_data_hall - See the Admin Guide.\n three_datacenter - See the Admin Guide.\n multi_dc - See the Admin Guide.\n\nStorage engine:\n ssd - B-Tree storage engine optimized for solid state disks.\n memory - Durable in-memory storage engine for small datasets.\n\nproxies=<PROXIES>: Sets the desired number of proxies in the cluster. Must be at least 1, or set to -1 which restores the number of proxies to the default value.\n\nlogs=<LOGS>: Sets the desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of logs to the default value.\n\nresolvers=<RESOLVERS>: Sets the desired number of resolvers in the cluster. Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the FoundationDB Administration Guide for more information."); helpMap["coordinators"] = CommandHelp( "coordinators auto|<ADDRESS>
+ [description=new_cluster_description]", "change cluster coordinators or description", @@ -1902,7 +1902,7 @@ void onoff_generator(const char* text, const char *line, std::vector<std::string>& lc) { array_generator(text, line, opts, lc); } void configure_generator(const char* text, const char* line, std::vector<std::string>& lc) { - const char* opts[] = {"new", "single", "double", "triple", "three_data_hall", "three_datacenter", "ssd", "ssd-1", "ssd-2", "memory", "proxies=", "logs=", "resolvers=", NULL}; + const char* opts[] = {"new", "single", "double", "triple", "three_data_hall", "three_datacenter", "multi_dc", "ssd", "ssd-1", "ssd-2", "memory", "proxies=", "logs=", "resolvers=", NULL}; array_generator(text, line, opts, lc); } diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 7aeabaa445..892cac8368 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -123,6 +123,15 @@ std::map<std::string, std::string> configForToken( std::string const& mode ) { tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "data_hall", IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))) )); + } else if(mode == "multi_dc") { + redundancy="6"; + log_replicas="4"; + storagePolicy = IRepPolicyRef(new PolicyAcross(3, "dcid", + IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))) + )); + tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "dcid", + IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne()))) + )); } else redundancySpecified = false; if (redundancySpecified) {
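An aside on PATCH 30/30: for readers new to the replication-policy combinators, the multi_dc block added above decodes as follows. This is a commented restatement of the patch (types as in ManagementAPI.actor.cpp), not new behavior:

    // PolicyOne()           -> place one replica, anywhere within the scope.
    // PolicyAcross(n, f, p) -> satisfy policy p independently in each of n
    //                          distinct values of locality field f.
    IRepPolicyRef storagePolicy = IRepPolicyRef(new PolicyAcross(3, "dcid",  // 3 datacenters,
        IRepPolicyRef(new PolicyAcross(2, "zoneid",                          // 2 zones in each,
            IRepPolicyRef(new PolicyOne())))));                              // 1 replica per zone
    // => 3 * 2 * 1 = 6 storage replicas, matching redundancy="6"; the tLog
    //    policy (2 x "dcid" of 2 x "zoneid") likewise yields the 4 replicas
    //    matching log_replicas="4".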