From 7ac098dc0d79a17f74e01710519987586698dafe Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Mon, 25 Feb 2019 18:39:14 -0500 Subject: [PATCH 01/56] Add Versionstamp support to the Go Tuple layer --- bindings/bindingtester/known_testers.py | 2 +- bindings/go/src/_stacktester/stacktester.go | 26 ++++- bindings/go/src/fdb/tuple/tuple.go | 117 +++++++++++++++++++- 3 files changed, 139 insertions(+), 6 deletions(-) diff --git a/bindings/bindingtester/known_testers.py b/bindings/bindingtester/known_testers.py index 8abe8c5741..fee09f5adf 100644 --- a/bindings/bindingtester/known_testers.py +++ b/bindings/bindingtester/known_testers.py @@ -62,6 +62,6 @@ testers = { 'ruby': Tester('ruby', _absolute_path('ruby/tests/tester.rb'), 2040, 23, MAX_API_VERSION), 'java': Tester('java', _java_cmd + 'StackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES), 'java_async': Tester('java', _java_cmd + 'AsyncStackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES), - 'go': Tester('go', _absolute_path('go/build/bin/_stacktester'), 2040, 200, MAX_API_VERSION), + 'go': Tester('go', _absolute_path('go/build/bin/_stacktester'), 2040, 200, MAX_API_VERSION, types=ALL_TYPES), 'flow': Tester('flow', _absolute_path('flow/bin/fdb_flow_tester'), 63, 500, MAX_API_VERSION, directory_snapshot_ops_enabled=False), } diff --git a/bindings/go/src/_stacktester/stacktester.go b/bindings/go/src/_stacktester/stacktester.go index f76641629e..151f1e766c 100644 --- a/bindings/go/src/_stacktester/stacktester.go +++ b/bindings/go/src/_stacktester/stacktester.go @@ -25,8 +25,6 @@ import ( "encoding/binary" "encoding/hex" "fmt" - "github.com/apple/foundationdb/bindings/go/src/fdb" - "github.com/apple/foundationdb/bindings/go/src/fdb/tuple" "log" "math/big" "os" @@ -37,6 +35,9 @@ import ( "strings" "sync" "time" + + "github.com/apple/foundationdb/bindings/go/src/fdb" + "github.com/apple/foundationdb/bindings/go/src/fdb/tuple" ) const verbose bool = false @@ -661,6 +662,25 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) { t = append(t, sm.waitAndPop().item) } sm.store(idx, []byte(t.Pack())) + case op == "TUPLE_PACK_VERSIONSTAMP": + var t tuple.Tuple + count := sm.waitAndPop().item.(int64) + for i := 0; i < int(count); i++ { + t = append(t, sm.waitAndPop().item) + } + + incomplete, err := t.HasIncompleteVersionstamp() + if incomplete == false { + sm.store(idx, []byte("ERROR: NONE")) + } else { + if err != nil { + sm.store(idx, []byte("ERROR: MULTIPLE")) + } else { + packed := t.Pack() + sm.store(idx, "OK") + sm.store(idx, packed) + } + } case op == "TUPLE_UNPACK": t, e := tuple.Unpack(fdb.Key(sm.waitAndPop().item.([]byte))) if e != nil { @@ -893,7 +913,7 @@ func main() { log.Fatal("API version not equal to value selected") } - db, e = fdb.OpenDatabase(clusterFile) + db, e = fdb.Open(clusterFile, []byte("DB")) if e != nil { log.Fatal(e) } diff --git a/bindings/go/src/fdb/tuple/tuple.go b/bindings/go/src/fdb/tuple/tuple.go index afd959420f..0e0cc732bd 100644 --- a/bindings/go/src/fdb/tuple/tuple.go +++ b/bindings/go/src/fdb/tuple/tuple.go @@ -39,6 +39,7 @@ package tuple import ( "bytes" "encoding/binary" + "errors" "fmt" "math" "math/big" @@ -72,6 +73,37 @@ type Tuple []TupleElement // an instance of this type. type UUID [16]byte +// Versionstamp . +type Versionstamp struct { + TransactionVersion [10]byte + UserVersion uint16 +} + +var incompleteTransactionVersion = [10]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF} + +const versionstampLength = 13 + +// IncompleteVersionstamp . 
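+// It returns a Versionstamp carrying the incomplete (all-0xFF) transaction
+// version and the given user version; the database fills in the real
+// transaction version when the containing mutation commits.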
+func IncompleteVersionstamp(userVersion uint16) Versionstamp { + return Versionstamp{ + TransactionVersion: incompleteTransactionVersion, + UserVersion: userVersion, + } +} + +// Bytes . +func (v Versionstamp) Bytes() []byte { + var scratch [12]byte + + copy(scratch[:], v.TransactionVersion[:]) + + binary.BigEndian.PutUint16(scratch[10:], v.UserVersion) + + fmt.Println(scratch) + + return scratch[:] +} + // Type codes: These prefix the different elements in a packed Tuple // to indicate what type they are. const nilCode = 0x00 @@ -86,6 +118,7 @@ const doubleCode = 0x21 const falseCode = 0x26 const trueCode = 0x27 const uuidCode = 0x30 +const versionstampCode = 0x33 var sizeLimits = []uint64{ 1<<(0*8) - 1, @@ -122,7 +155,15 @@ func adjustFloatBytes(b []byte, encode bool) { } type packer struct { - buf []byte + versionstampPos int32 + buf []byte +} + +func newPacker() *packer { + return &packer{ + versionstampPos: -1, + buf: make([]byte, 0, 64), + } } func (p *packer) putByte(b byte) { @@ -249,6 +290,18 @@ func (p *packer) encodeUUID(u UUID) { p.putBytes(u[:]) } +func (p *packer) encodeVersionstamp(v Versionstamp) { + p.putByte(versionstampCode) + + if p.versionstampPos != 0 && v.TransactionVersion == incompleteTransactionVersion { + panic(fmt.Sprintf("Tuple can only contain one unbound versionstamp")) + } else { + p.versionstampPos = int32(len(p.buf)) + } + + p.putBytes(v.Bytes()) +} + func (p *packer) encodeTuple(t Tuple, nested bool) { if nested { p.putByte(nestedCode) @@ -293,6 +346,8 @@ func (p *packer) encodeTuple(t Tuple, nested bool) { } case UUID: p.encodeUUID(e) + case Versionstamp: + p.encodeVersionstamp(e) default: panic(fmt.Sprintf("unencodable element at index %d (%v, type %T)", i, t[i], t[i])) } @@ -314,11 +369,50 @@ func (p *packer) encodeTuple(t Tuple, nested bool) { // call Pack when using a Tuple with a FoundationDB API function that requires a // key. 
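+//
+// Note: a Tuple containing an incomplete Versionstamp should be packed with
+// PackWithVersionstamp instead, so that the result carries the 4-byte offset
+// that versionstamped-key operations expect.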
func (t Tuple) Pack() []byte { - p := packer{buf: make([]byte, 0, 64)} + p := newPacker() p.encodeTuple(t, false) return p.buf } +// PackWithVersionstamp packs the specified tuple into a key for versionstamp operations +func (t Tuple) PackWithVersionstamp() ([]byte, error) { + hasVersionstamp, err := t.HasIncompleteVersionstamp() + if err != nil { + return nil, err + } + + p := newPacker() + p.encodeTuple(t, false) + + if hasVersionstamp { + var scratch [4]byte + binary.LittleEndian.PutUint32(scratch[:], uint32(p.versionstampPos)) + p.putBytes(scratch[:]) + } + + return p.buf, nil +} + +// HasIncompleteVersionstamp determines if there is at least one incomplete versionstamp in a tuple +func (t Tuple) HasIncompleteVersionstamp() (bool, error) { + incompleteCount := 0 + for _, el := range t { + switch e := el.(type) { + case Versionstamp: + if e.TransactionVersion == incompleteTransactionVersion { + incompleteCount++ + } + } + } + + var err error + if incompleteCount > 1 { + err = errors.New("Tuple can only contain one unbound versionstamp") + } + + return incompleteCount == 1, err +} + func findTerminator(b []byte) int { bp := b var length int @@ -438,6 +532,20 @@ func decodeUUID(b []byte) (UUID, int) { return u, 17 } +func decodeVersionstamp(b []byte) (Versionstamp, int) { + var transactionVersion [10]byte + var userVersion uint16 + + copy(transactionVersion[:], b[1:11]) + + userVersion = binary.BigEndian.Uint16(b[11:]) + + return Versionstamp{ + TransactionVersion: transactionVersion, + UserVersion: userVersion, + }, versionstampLength +} + func decodeTuple(b []byte, nested bool) (Tuple, int, error) { var t Tuple @@ -489,6 +597,11 @@ func decodeTuple(b []byte, nested bool) (Tuple, int, error) { return nil, i, fmt.Errorf("insufficient bytes to decode UUID starting at position %d of byte array for tuple", i) } el, off = decodeUUID(b[i:]) + case b[i] == versionstampCode: + if i+versionstampLength > len(b) { + return nil, i, fmt.Errorf("insufficient bytes to decode Versionstamp starting at position %d of byte array for tuple", i) + } + el, off = decodeVersionstamp(b[i:]) case b[i] == nestedCode: var err error el, off, err = decodeTuple(b[i+1:], true) From b2f26224b9a46a6f96a3bcccd6a7bbdee66ac198 Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Mon, 25 Feb 2019 18:41:57 -0500 Subject: [PATCH 02/56] Revert unintentional change back to old API --- bindings/go/src/_stacktester/stacktester.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/go/src/_stacktester/stacktester.go b/bindings/go/src/_stacktester/stacktester.go index 151f1e766c..6a1100ffe7 100644 --- a/bindings/go/src/_stacktester/stacktester.go +++ b/bindings/go/src/_stacktester/stacktester.go @@ -913,7 +913,7 @@ func main() { log.Fatal("API version not equal to value selected") } - db, e = fdb.Open(clusterFile, []byte("DB")) + db, e = fdb.OpenDatabase(clusterFile) if e != nil { log.Fatal(e) } From 292bb6ab0f564a4649f20b2e4d60ab5e918758ea Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Mon, 25 Feb 2019 18:57:28 -0500 Subject: [PATCH 03/56] Make `versionstampLength` constant equal Versionstamp actual length. 
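
A packed Versionstamp element occupies 13 bytes on the wire: one type-code
byte (0x33) followed by a 12-byte payload (10-byte big-endian transaction
version plus 2-byte big-endian user version). The constant previously counted
the type code as well; it now names only the payload length, and the decode
paths add 1 for the type code explicitly.

Illustrative sketch (not part of this patch) checking the packed size with
the tuple package from this series:

	package main

	import (
		"fmt"

		"github.com/apple/foundationdb/bindings/go/src/fdb/tuple"
	)

	func main() {
		// A complete versionstamp: zero transaction version, user version 42.
		vs := tuple.Versionstamp{UserVersion: 42}
		packed := tuple.Tuple{vs}.Pack()
		fmt.Println(len(packed)) // 13 = 1 type-code byte + versionstampLength (12)
	}
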
--- bindings/go/src/fdb/tuple/tuple.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bindings/go/src/fdb/tuple/tuple.go b/bindings/go/src/fdb/tuple/tuple.go index 0e0cc732bd..2c30705ba0 100644 --- a/bindings/go/src/fdb/tuple/tuple.go +++ b/bindings/go/src/fdb/tuple/tuple.go @@ -81,7 +81,7 @@ type Versionstamp struct { var incompleteTransactionVersion = [10]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF} -const versionstampLength = 13 +const versionstampLength = 12 // IncompleteVersionstamp . func IncompleteVersionstamp(userVersion uint16) Versionstamp { @@ -543,7 +543,7 @@ func decodeVersionstamp(b []byte) (Versionstamp, int) { return Versionstamp{ TransactionVersion: transactionVersion, UserVersion: userVersion, - }, versionstampLength + }, versionstampLength + 1 } func decodeTuple(b []byte, nested bool) (Tuple, int, error) { @@ -598,7 +598,7 @@ func decodeTuple(b []byte, nested bool) (Tuple, int, error) { } el, off = decodeUUID(b[i:]) case b[i] == versionstampCode: - if i+versionstampLength > len(b) { + if i+versionstampLength+1 > len(b) { return nil, i, fmt.Errorf("insufficient bytes to decode Versionstamp starting at position %d of byte array for tuple", i) } el, off = decodeVersionstamp(b[i:]) From 4dd04862c7037a5ba246eab3ea0aa4a40f644942 Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Mon, 25 Feb 2019 19:05:45 -0500 Subject: [PATCH 04/56] Flatten if statements --- bindings/go/src/_stacktester/stacktester.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/bindings/go/src/_stacktester/stacktester.go b/bindings/go/src/_stacktester/stacktester.go index 6a1100ffe7..8ef2c3d78f 100644 --- a/bindings/go/src/_stacktester/stacktester.go +++ b/bindings/go/src/_stacktester/stacktester.go @@ -672,14 +672,12 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) { incomplete, err := t.HasIncompleteVersionstamp() if incomplete == false { sm.store(idx, []byte("ERROR: NONE")) + } else if err != nil { + sm.store(idx, []byte("ERROR: MULTIPLE")) } else { - if err != nil { - sm.store(idx, []byte("ERROR: MULTIPLE")) - } else { - packed := t.Pack() - sm.store(idx, "OK") - sm.store(idx, packed) - } + packed := t.Pack() + sm.store(idx, "OK") + sm.store(idx, packed) } case op == "TUPLE_UNPACK": t, e := tuple.Unpack(fdb.Key(sm.waitAndPop().item.([]byte))) From 05d347e194d0de66f635b61b72477ead4b75c7da Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Mon, 25 Feb 2019 19:08:29 -0500 Subject: [PATCH 05/56] Push byte slice instead of string onto the stack --- bindings/go/src/_stacktester/stacktester.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/go/src/_stacktester/stacktester.go b/bindings/go/src/_stacktester/stacktester.go index 8ef2c3d78f..b2012546e7 100644 --- a/bindings/go/src/_stacktester/stacktester.go +++ b/bindings/go/src/_stacktester/stacktester.go @@ -676,7 +676,7 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) { sm.store(idx, []byte("ERROR: MULTIPLE")) } else { packed := t.Pack() - sm.store(idx, "OK") + sm.store(idx, []byte("OK")) sm.store(idx, packed) } case op == "TUPLE_UNPACK": From 3da85f3acd084b1194bbec209cfa05dc5752b439 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 28 Feb 2019 17:45:00 -0800 Subject: [PATCH 06/56] implemented the \xff/metadataVersion key, which can be used by layers to help them cheaply cache metadata and know when their cache is invalid --- fdbclient/BackupAgentBase.actor.cpp | 2 + fdbclient/DatabaseBackupAgent.actor.cpp | 17 +++++- 
fdbclient/DatabaseContext.h | 3 + fdbclient/FileBackupAgent.actor.cpp | 1 + fdbclient/Knobs.cpp | 1 + fdbclient/Knobs.h | 1 + fdbclient/MasterProxyInterface.h | 3 +- fdbclient/NativeAPI.actor.cpp | 45 +++++++++++++- fdbclient/NativeAPI.actor.h | 1 + fdbclient/ReadYourWrites.actor.cpp | 21 +++++-- fdbclient/SystemData.cpp | 2 + fdbclient/SystemData.h | 2 + fdbserver/ApplyMetadataMutation.h | 23 +++++++- fdbserver/MasterProxyServer.actor.cpp | 68 +++++++++------------- fdbserver/masterserver.actor.cpp | 3 + fdbserver/workloads/VersionStamp.actor.cpp | 67 +++++++++++++++------ tests/fast/VersionStamp.txt | 3 +- 17 files changed, 196 insertions(+), 67 deletions(-) mode change 100755 => 100644 fdbclient/FileBackupAgent.actor.cpp diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index 6aa5bf8aae..14c17cf20b 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -586,6 +586,8 @@ ACTOR Future applyMutations(Database cx, Key uid, Key addPrefix, Key remov state Future error = actorCollection( addActor.getFuture() ); state int maxBytes = CLIENT_KNOBS->APPLY_MIN_LOCK_BYTES; + keyVersion->insert(metadataVersionKey, 0); + try { loop { if(beginVersion >= *endVersion) { diff --git a/fdbclient/DatabaseBackupAgent.actor.cpp b/fdbclient/DatabaseBackupAgent.actor.cpp index 42389466b4..3450a2b061 100644 --- a/fdbclient/DatabaseBackupAgent.actor.cpp +++ b/fdbclient/DatabaseBackupAgent.actor.cpp @@ -1523,8 +1523,8 @@ namespace dbBackup { } } + state Reference srcTr2(new ReadYourWritesTransaction(taskBucket->src)); loop { - state Reference srcTr2(new ReadYourWritesTransaction(taskBucket->src)); try { srcTr2->setOption(FDBTransactionOptions::LOCK_AWARE); srcTr2->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -1554,6 +1554,21 @@ namespace dbBackup { } } + state Reference srcTr3(new ReadYourWritesTransaction(taskBucket->src)); + loop { + try { + srcTr3->setOption(FDBTransactionOptions::LOCK_AWARE); + srcTr3->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + srcTr3->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue); + + wait(srcTr3->commit()); + break; + } catch(Error &e) { + wait(srcTr3->onError(e)); + } + } + return Void(); } diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 81e7df2c3d..9f0f32190b 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -159,6 +159,9 @@ public: Reference cluster; int apiVersion; + + int mvcInsertLocation; + std::vector>> metadataVersionCache; }; #endif diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp old mode 100755 new mode 100644 index 8c7a281340..4e342cc7f5 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -2382,6 +2382,7 @@ namespace fileBackup { state RestoreConfig restore(task); restore.stateEnum().set(tr, ERestoreState::COMPLETED); + tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue); // Clear the file map now since it could be huge. 
restore.fileSet().clear(tr); diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index 4a7b573a4b..0789d46ee6 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -58,6 +58,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( SYSTEM_KEY_SIZE_LIMIT, 3e4 ); init( VALUE_SIZE_LIMIT, 1e5 ); init( SPLIT_KEY_SIZE_LIMIT, KEY_SIZE_LIMIT/2 ); if( randomize && BUGGIFY ) SPLIT_KEY_SIZE_LIMIT = KEY_SIZE_LIMIT - serverKeysPrefixFor(UID()).size() - 1; + init( METADATA_VERSION_CACHE_SIZE, 1000 ); init( MAX_BATCH_SIZE, 20 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; // Note that SERVER_KNOBS->START_TRANSACTION_MAX_BUDGET_SIZE is set to match this value init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; diff --git a/fdbclient/Knobs.h b/fdbclient/Knobs.h index 4f405a0a3c..0de06e92a3 100644 --- a/fdbclient/Knobs.h +++ b/fdbclient/Knobs.h @@ -56,6 +56,7 @@ public: int64_t SYSTEM_KEY_SIZE_LIMIT; int64_t VALUE_SIZE_LIMIT; int64_t SPLIT_KEY_SIZE_LIMIT; + int METADATA_VERSION_CACHE_SIZE; int MAX_BATCH_SIZE; double GRV_BATCH_TIMEOUT; diff --git a/fdbclient/MasterProxyInterface.h b/fdbclient/MasterProxyInterface.h index 05a55fbbd2..802d4e529d 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -116,10 +116,11 @@ static inline int getBytes( CommitTransactionRequest const& r ) { struct GetReadVersionReply { Version version; bool locked; + Optional metadataVersion; template void serialize(Ar& ar) { - serializer(ar, version, locked); + serializer(ar, version, locked, metadataVersion); } }; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 188b41dbea..3a16bd076b 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -472,8 +472,9 @@ DatabaseContext::DatabaseContext( transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0), transactionsMaybeCommitted(0), transactionsResourceConstrained(0), outstandingWatches(0), - latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000) + latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvcInsertLocation(0) { + metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; logger = databaseLogger( this ); @@ -1888,6 +1889,7 @@ void Transaction::operator=(Transaction&& r) noexcept(true) { cx = std::move(r.cx); tr = std::move(r.tr); readVersion = std::move(r.readVersion); + metadataVersion = std::move(r.metadataVersion); extraConflictRanges = std::move(r.extraConflictRanges); commitResult = std::move(r.commitResult); committing = std::move(r.committing); @@ -1934,6 +1936,36 @@ Future> Transaction::get( const Key& key, bool snapshot ) { if( !snapshot ) tr.transaction.read_conflict_ranges.push_back(tr.arena, singleKeyRange(key, tr.arena)); + if(key == metadataVersionKey) { + if(!ver.isReady() || metadataVersion.isSet()) { + return metadataVersion.getFuture(); + } else { + if(ver.isError()) return ver.getError(); + if(ver.get() == cx->metadataVersionCache[cx->mvcInsertLocation].first) { + return cx->metadataVersionCache[cx->mvcInsertLocation].second; + } + + Version v = ver.get(); + 
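+			// metadataVersionCache is a fixed-size circular buffer ordered by
+			// version, with the newest entry at mvcInsertLocation; binary search
+			// the wrapped range for the entry matching this read version.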
int hi = cx->mvcInsertLocation; + int lo = (cx->mvcInsertLocation+1)%cx->metadataVersionCache.size(); + + while(true) { + int cu = hi > lo ? (hi + lo)/2 : ((hi + cx->metadataVersionCache.size() + lo)/2)%cx->metadataVersionCache.size(); + if(v == cx->metadataVersionCache[cu].first) { + return cx->metadataVersionCache[cu].second; + } + if(cu == lo) { + break; + } + if(v < cx->metadataVersionCache[cu].first) { + hi = cu; + } else { + lo = (cu+1)%cx->metadataVersionCache.size(); + } + } + } + } + return getValue( ver, key, cx, info, trLogInfo ); } @@ -2239,6 +2271,7 @@ double Transaction::getBackoff(int errCode) { void Transaction::reset() { tr = CommitTransactionRequest(); readVersion = Future(); + metadataVersion = Promise>(); extraConflictRanges.clear(); versionstampPromise = Promise>(); commitResult = Promise(); @@ -2813,7 +2846,7 @@ ACTOR Future readVersionBatcher( DatabaseContext *cx, FutureStream< std::p } } -ACTOR Future extractReadVersion(DatabaseContext* cx, Reference trLogInfo, Future f, bool lockAware, double startTime) { +ACTOR Future extractReadVersion(DatabaseContext* cx, Reference trLogInfo, Future f, bool lockAware, double startTime, Promise> metadataVersion) { GetReadVersionReply rep = wait(f); double latency = now() - startTime; cx->GRVLatencies.addSample(latency); @@ -2822,6 +2855,12 @@ ACTOR Future extractReadVersion(DatabaseContext* cx, Reference cx->metadataVersionCache[cx->mvcInsertLocation].first) { + cx->mvcInsertLocation = (cx->mvcInsertLocation + 1)%cx->metadataVersionCache.size(); + cx->metadataVersionCache[cx->mvcInsertLocation] = std::make_pair(rep.version, rep.metadataVersion); + } + + metadataVersion.send(rep.metadataVersion); return rep.version; } @@ -2837,7 +2876,7 @@ Future Transaction::getReadVersion(uint32_t flags) { Promise p; batcher.stream.send( std::make_pair( p, info.debugID ) ); startTime = now(); - readVersion = extractReadVersion( cx.getPtr(), trLogInfo, p.getFuture(), options.lockAware, startTime); + readVersion = extractReadVersion( cx.getPtr(), trLogInfo, p.getFuture(), options.lockAware, startTime, metadataVersion); } return readVersion; } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 90c04f86c8..b785a35c47 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -314,6 +314,7 @@ private: Version committedVersion; CommitTransactionRequest tr; Future readVersion; + Promise> metadataVersion; vector>> extraConflictRanges; Promise commitResult; Future committing; diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index b7240329db..4cfbf8e8bd 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1046,7 +1046,7 @@ public: return Void(); } - ryw->writeRangeToNativeTransaction(KeyRangeRef(StringRef(), ryw->getMaxWriteKey())); + ryw->writeRangeToNativeTransaction(KeyRangeRef(StringRef(), allKeys.end)); auto conflictRanges = ryw->readConflicts.ranges(); for( auto iter = conflictRanges.begin(); iter != conflictRanges.end(); ++iter ) { @@ -1222,7 +1222,7 @@ Future< Optional > ReadYourWritesTransaction::get( const Key& key, bool s if( resetPromise.isSet() ) return resetPromise.getFuture().getError(); - if(key >= getMaxReadKey()) + if(key >= getMaxReadKey() && (!tr.apiVersionAtLeast(610) || key != metadataVersionKey)) return key_outside_legal_range(); //There are no keys in the database with size greater than KEY_SIZE_LIMIT @@ -1499,8 +1499,18 @@ void ReadYourWritesTransaction::atomicOp( const KeyRef& key, const ValueRef& ope throw 
used_during_commit(); } - if(key >= getMaxWriteKey()) + if (key == metadataVersionKey) { + if(!tr.apiVersionAtLeast(610)) { + throw key_outside_legal_range(); + } + + if(operationType != MutationRef::SetVersionstampedValue || operand != metadataVersionRequiredValue) { + throw client_invalid_operation(); + } + } + else if(key >= getMaxWriteKey()) { throw key_outside_legal_range(); + } if(!isValidMutationType(operationType) || !isAtomicOp((MutationRef::Type) operationType)) throw invalid_mutation_type(); @@ -1565,7 +1575,10 @@ void ReadYourWritesTransaction::set( const KeyRef& key, const ValueRef& value ) BinaryReader::fromStringRef(value, IncludeVersion()).reboot.send( RebootRequest(false, true) ); return; } - + if (key == metadataVersionKey) { + throw client_invalid_operation(); + } + bool addWriteConflict = !options.getAndResetWriteConflictDisabled(); if(checkUsedDuringCommit()) { diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index ea4be9c8d8..4c1a40b145 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -590,6 +590,8 @@ std::pair decodeMetricConfKey( KeyRef const& prefix, KeyR const KeyRef maxUIDKey = LiteralStringRef("\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"); const KeyRef databaseLockedKey = LiteralStringRef("\xff/dbLocked"); +const KeyRef metadataVersionKey = LiteralStringRef("\xff/metadataVersion"); +const KeyRef metadataVersionRequiredValue = LiteralStringRef("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"); const KeyRef mustContainSystemMutationsKey = LiteralStringRef("\xff/mustContainSystemMutations"); const KeyRangeRef monitorConfKeys( diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index e136d83986..3215e16a2e 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -265,6 +265,8 @@ extern const KeyRef metricConfPrefix; extern const KeyRef maxUIDKey; extern const KeyRef databaseLockedKey; +extern const KeyRef metadataVersionKey; +extern const KeyRef metadataVersionRequiredValue; extern const KeyRef mustContainSystemMutationsKey; // Key range reserved for storing changes to monitor conf files diff --git a/fdbserver/ApplyMetadataMutation.h b/fdbserver/ApplyMetadataMutation.h index 8c833eb24c..d7f3ebc42b 100644 --- a/fdbserver/ApplyMetadataMutation.h +++ b/fdbserver/ApplyMetadataMutation.h @@ -178,7 +178,7 @@ static void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefset(KeyValueRef(m.param1, m.param2)); } @@ -225,6 +225,9 @@ static void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefmodify(KeyRangeRef(logRangeBegin, logRangeEnd))) { logRange->value().insert(logDestination); } + for (auto& logRange : vecBackupKeys->modify(singleKeyRange(metadataVersionKey))) { + logRange->value().insert(logDestination); + } // Log the modification TraceEvent("LogRangeAdd").detail("LogRanges", vecBackupKeys->size()).detail("MutationKey", printable(m.param1)) @@ -345,6 +348,9 @@ static void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefclear(singleKeyRange(databaseLockedKey)); } + if (range.contains(metadataVersionKey)) { + if(!initialCommit) txnStateStore->clear(singleKeyRange(metadataVersionKey)); + } if (range.contains(mustContainSystemMutationsKey)) { if(!initialCommit) txnStateStore->clear(singleKeyRange(mustContainSystemMutationsKey)); } @@ -417,6 +423,21 @@ static void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefintersectingRanges(normalKeys)) { + if(it.value().count(logDestination) > 0) { + foundKey = true; + break; + } + } 
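+			// If no backed-up key range still logs to this destination, stop
+			// mirroring metadataVersionKey mutations to it as well.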
+ if(!foundKey) { + auto logRanges = vecBackupKeys->modify(singleKeyRange(metadataVersionKey)); + for (auto logRange : logRanges) { + auto &logRangeMap = logRange->value(); + logRangeMap.erase(logDestination); + } + } + // Coallesce the entire range vecBackupKeys->coalesce(allKeys); } diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 33082f3900..26b9b8c32a 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -206,6 +206,7 @@ struct ProxyCommitData { bool firstProxy; double lastCoalesceTime; bool locked; + Optional metadataVersion; double commitBatchInterval; int64_t localCommitBatchesStarted; @@ -642,6 +643,8 @@ ACTOR Future commitBatch( lockedKey = self->txnStateStore->readValue(databaseLockedKey).get(); state bool lockedAfter = lockedKey.present() && lockedKey.get().size(); + state Optional metadataVersionAfter = self->txnStateStore->readValue(metadataVersionKey).get(); + auto fcm = self->logAdapter->getCommitMessage(); storeCommits.push_back(std::make_pair(fcm, self->txnStateStore->commit())); self->version = commitVersion; @@ -748,54 +751,36 @@ ACTOR Future commitBatch( else UNREACHABLE(); - // Check on backing up key, if backup ranges are defined and a normal key - if ((self->vecBackupKeys.size() > 1) && normalKeys.contains(m.param1)) { - if (isAtomicOp((MutationRef::Type)m.type)) { + + // Check on backing up key, if backup ranges are defined and a normal key + if (self->vecBackupKeys.size() > 1 && (normalKeys.contains(m.param1) || m.param1 == metadataVersionKey)) { + if (m.type != MutationRef::Type::ClearRange) { // Add the mutation to the relevant backup tag for (auto backupName : self->vecBackupKeys[m.param1]) { logRangeMutations[backupName].push_back_deep(logRangeMutationsArena, m); } } else { - switch (m.type) + KeyRangeRef mutationRange(m.param1, m.param2); + KeyRangeRef intersectionRange; + + // Identify and add the intersecting ranges of the mutation to the array of mutations to serialize + for (auto backupRange : self->vecBackupKeys.intersectingRanges(mutationRange)) { - // Backup the mutation, if within a backup range - case MutationRef::Type::SetValue: + // Get the backup sub range + const auto& backupSubrange = backupRange.range(); + + // Determine the intersecting range + intersectionRange = mutationRange & backupSubrange; + + // Create the custom mutation for the specific backup tag + MutationRef backupMutation(MutationRef::Type::ClearRange, intersectionRange.begin, intersectionRange.end); + // Add the mutation to the relevant backup tag - for (auto backupName : self->vecBackupKeys[m.param1]) { - logRangeMutations[backupName].push_back_deep(logRangeMutationsArena, m); + for (auto backupName : backupRange.value()) { + logRangeMutations[backupName].push_back_deep(logRangeMutationsArena, backupMutation); } - break; - - case MutationRef::Type::ClearRange: - { - KeyRangeRef mutationRange(m.param1, m.param2); - KeyRangeRef intersectionRange; - - // Identify and add the intersecting ranges of the mutation to the array of mutations to serialize - for (auto backupRange : self->vecBackupKeys.intersectingRanges(mutationRange)) - { - // Get the backup sub range - const auto& backupSubrange = backupRange.range(); - - // Determine the intersecting range - intersectionRange = mutationRange & backupSubrange; - - // Create the custom mutation for the specific backup tag - MutationRef backupMutation(MutationRef::Type::ClearRange, intersectionRange.begin, intersectionRange.end); - - // Add the 
mutation to the relevant backup tag - for (auto backupName : backupRange.value()) { - logRangeMutations[backupName].push_back_deep(logRangeMutationsArena, backupMutation); - } - } - } - break; - - default: - UNREACHABLE(); - break; } } } @@ -880,6 +865,7 @@ ACTOR Future commitBatch( when(GetReadVersionReply v = wait(self->getConsistentReadVersion.getReply(GetReadVersionRequest(0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE | GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)))) { if(v.version > self->committedVersion.get()) { self->locked = v.locked; + self->metadataVersion = v.metadataVersion; self->committedVersion.set(v.version); } @@ -964,6 +950,7 @@ ACTOR Future commitBatch( TEST(self->committedVersion.get() > commitVersion); // A later version was reported committed first if( commitVersion > self->committedVersion.get() ) { self->locked = lockedAfter; + self->metadataVersion = metadataVersionAfter; self->committedVersion.set(commitVersion); } @@ -1052,7 +1039,8 @@ ACTOR Future getLiveCommittedVersion(ProxyCommitData* commi GetReadVersionReply rep; rep.version = commitData->committedVersion.get(); rep.locked = commitData->locked; - + rep.metadataVersion = commitData->metadataVersion; + for (auto v : versions) { if(v.version > rep.version) { rep = v; @@ -1495,6 +1483,7 @@ ACTOR Future masterProxyServerCore( g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion"); GetReadVersionReply rep; rep.locked = commitData.locked; + rep.metadataVersion = commitData.metadataVersion; rep.version = commitData.committedVersion.get(); req.reply.send(rep); } @@ -1577,6 +1566,7 @@ ACTOR Future masterProxyServerCore( auto lockedKey = commitData.txnStateStore->readValue(databaseLockedKey).get(); commitData.locked = lockedKey.present() && lockedKey.get().size(); + commitData.metadataVersion = commitData.txnStateStore->readValue(metadataVersionKey).get(); commitData.txnStateStore->enableSnapshot(); } diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 4758f27d43..0c97349a28 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -493,6 +493,8 @@ ACTOR Future> provisionalMaster( ReferencetxnStateStore->readValue(databaseLockedKey).get(); state bool locked = lockedKey.present() && lockedKey.get().size(); + state Optional metadataVersion = parent->txnStateStore->readValue(metadataVersionKey).get(); + // We respond to a minimal subset of the master proxy protocol. Our sole purpose is to receive a single write-only transaction // which might repair our configuration, and return it. 
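+	// The provisional master also reports the metadataVersion read from the
+	// txnStateStore above, so the GetReadVersionReply it sends during recovery
+	// carries a metadata version consistent with its state store.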
loop choose { @@ -501,6 +503,7 @@ ACTOR Future> provisionalMaster( ReferencelastEpochEnd; rep.locked = locked; + rep.metadataVersion = metadataVersion; req.reply.send( rep ); } else req.reply.send(Never()); // We can't perform causally consistent reads without recovering diff --git a/fdbserver/workloads/VersionStamp.actor.cpp b/fdbserver/workloads/VersionStamp.actor.cpp index dc11f84b0a..75460089a1 100644 --- a/fdbserver/workloads/VersionStamp.actor.cpp +++ b/fdbserver/workloads/VersionStamp.actor.cpp @@ -39,6 +39,8 @@ struct VersionStampWorkload : TestWorkload { bool validateExtraDB; std::map>>> key_commit; std::map>>> versionStampKey_commit; + int apiVersion; + bool onlyTestChangingMetadataVersion; VersionStampWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) @@ -52,6 +54,7 @@ struct VersionStampWorkload : TestWorkload { vsKeyPrefix = LiteralStringRef("K_").withPrefix(prefix); vsValuePrefix = LiteralStringRef("V_").withPrefix(prefix); validateExtraDB = getOption(options, LiteralStringRef("validateExtraDB"), false); + onlyTestChangingMetadataVersion = getOption(options, LiteralStringRef("onlyTestChangingMetadataVersion"), false); } virtual std::string description() { return "VersionStamp"; } @@ -62,7 +65,6 @@ struct VersionStampWorkload : TestWorkload { // Versionstamp behavior changed starting with API version 520, so // choose a version to check compatibility. double choice = g_random->random01(); - int apiVersion; if (choice < 0.1) { apiVersion = 500; } @@ -83,6 +85,10 @@ struct VersionStampWorkload : TestWorkload { } Key keyForIndex(uint64_t index) { + if((apiVersion >= 610 || apiVersion == Database::API_VERSION_LATEST) && index == 0) { + return metadataVersionKey; + } + Key result = makeString(keyBytes); uint8_t* data = mutateString(result); memset(data, '.', keyBytes); @@ -157,11 +163,23 @@ struct VersionStampWorkload : TestWorkload { // We specifically wish to grab the smalles read version that we can get and maintain it, to // have the strictest check we can on versionstamps monotonically increasing. state Version readVersion = wait(tr.getReadVersion()); + + if(BUGGIFY) { + tr.reset(); + if(g_random->random01() < 0.5) { + readVersion--; + } + tr.setVersion(readVersion); + } + state Standalone result; loop{ try { Standalone result_ = wait(tr.getRange(KeyRangeRef(self->vsValuePrefix, endOfRange(self->vsValuePrefix)), self->nodeCount + 1)); result = result_; + if((self->apiVersion >= 610 || self->apiVersion == Database::API_VERSION_LATEST) && tr.get(metadataVersionKey).get().present() && self->key_commit.count(metadataVersionKey)) { + result.push_back_deep(result.arena(), KeyValueRef(metadataVersionKey,tr.get(metadataVersionKey).get().get())); + } ASSERT(result.size() <= self->nodeCount); if (self->failIfDataLost) { ASSERT(result.size() == self->key_commit.size()); @@ -171,29 +189,40 @@ struct VersionStampWorkload : TestWorkload { //TraceEvent("VST_Check0").detail("Size", result.size()).detail("NodeCount", self->nodeCount).detail("KeyCommit", self->key_commit.size()).detail("ReadVersion", readVersion); for (auto it : result) { - const Standalone key = it.key.removePrefix(self->vsValuePrefix); + const Standalone key = it.key == metadataVersionKey ? 
metadataVersionKey : it.key.removePrefix(self->vsValuePrefix); Version parsedVersion; Standalone parsedVersionstamp; std::tie(parsedVersion, parsedVersionstamp) = versionFromValue(it.value); + ASSERT(parsedVersion <= readVersion); //TraceEvent("VST_Check0a").detail("ItKey", printable(it.key)).detail("ItValue", printable(it.value)).detail("ParsedVersion", parsedVersion); const auto& all_values_iter = self->key_commit.find(key); ASSERT(all_values_iter != self->key_commit.end()); // Reading a key that we didn't commit. const auto& all_values = all_values_iter->second; - const auto& value_pair_iter = std::find_if(all_values.cbegin(), all_values.cend(), - [parsedVersion](const std::pair>& pair) { return pair.first == parsedVersion; }); - ASSERT(value_pair_iter != all_values.cend()); // The key exists, but we never wrote the timestamp. - if (self->failIfDataLost) { - auto last_element_iter = all_values.cend(); last_element_iter--; - ASSERT(value_pair_iter == last_element_iter); - } - Version commitVersion = value_pair_iter->first; - Standalone commitVersionstamp = value_pair_iter->second; + if(it.key == metadataVersionKey && !self->onlyTestChangingMetadataVersion) { + if(self->failIfDataLost) { + for(auto& it : all_values) { + ASSERT(it.first <= parsedVersion); + if(it.first == parsedVersion) { + ASSERT(it.second.compare(parsedVersionstamp) == 0); + } + } + } + } else { + const auto& value_pair_iter = std::find_if(all_values.cbegin(), all_values.cend(), + [parsedVersion](const std::pair>& pair) { return pair.first == parsedVersion; }); + ASSERT(value_pair_iter != all_values.cend()); // The key exists, but we never wrote the timestamp. + if (self->failIfDataLost) { + auto last_element_iter = all_values.cend(); last_element_iter--; + ASSERT(value_pair_iter == last_element_iter); + } + Version commitVersion = value_pair_iter->first; + Standalone commitVersionstamp = value_pair_iter->second; - //TraceEvent("VST_Check0b").detail("Version", commitVersion).detail("CommitVersion", printable(commitVersionstamp)); - ASSERT(parsedVersion <= readVersion); - ASSERT(commitVersionstamp.compare(parsedVersionstamp) == 0); + //TraceEvent("VST_Check0b").detail("Version", commitVersion).detail("CommitVersion", printable(commitVersionstamp)); + ASSERT(commitVersionstamp.compare(parsedVersionstamp) == 0); + } } Standalone result__ = wait(tr.getRange(KeyRangeRef(self->vsKeyPrefix, endOfRange(self->vsKeyPrefix)), self->nodeCount + 1)); @@ -270,7 +299,11 @@ struct VersionStampWorkload : TestWorkload { state Version committedVersion; state Value versionStampValue; - if (oldVSFormat) { + + if(key == metadataVersionKey) { + value = metadataVersionRequiredValue; + versionStampValue = value; + } else if (oldVSFormat) { versionStampValue = value; } else { versionStampValue = value.withSuffix(LiteralStringRef("\x00\x00\x00\x00")); @@ -311,7 +344,7 @@ struct VersionStampWorkload : TestWorkload { } const Version value_version = versionFromValue(vs_value.get()).first; //TraceEvent("VST_CommitUnknownRead").detail("VsValue", vs_value.present() ? printable(vs_value.get()) : "did not exist"); - const auto& value_ts = self->key_commit[key.removePrefix(self->vsValuePrefix)]; + const auto& value_ts = self->key_commit[key == metadataVersionKey ? 
metadataVersionKey : key.removePrefix(self->vsValuePrefix)]; const auto& iter = std::find_if(value_ts.cbegin(), value_ts.cend(), [value_version](const std::pair>& pair) { return value_version == pair.first; @@ -343,7 +376,7 @@ struct VersionStampWorkload : TestWorkload { const Standalone vsKeyKey = versionStampKey.removePrefix(self->vsKeyPrefix).substr(4, 16); const auto& committedVersionPair = std::make_pair(committedVersion, committedVersionStamp); //TraceEvent("VST_CommitSuccess").detail("Key", printable(key)).detail("VsKey", printable(versionStampKey)).detail("VsKeyKey", printable(vsKeyKey)).detail("Clear", printable(range)).detail("Version", tr.getCommittedVersion()).detail("VsValue", printable(committedVersionPair.second)); - self->key_commit[key.removePrefix(self->vsValuePrefix)].push_back(committedVersionPair); + self->key_commit[key == metadataVersionKey ? metadataVersionKey : key.removePrefix(self->vsValuePrefix)].push_back(committedVersionPair); self->versionStampKey_commit[vsKeyKey].push_back(committedVersionPair); break; } diff --git a/tests/fast/VersionStamp.txt b/tests/fast/VersionStamp.txt index 8a993597ee..2925a39e0e 100644 --- a/tests/fast/VersionStamp.txt +++ b/tests/fast/VersionStamp.txt @@ -1,2 +1,3 @@ testTitle=VersionStamp -testName=VersionStamp \ No newline at end of file +testName=VersionStamp +onlyTestChangingMetadataVersion=true From 7ef189701e3ec2496dd080f57fec124ae469ceaa Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Thu, 28 Feb 2019 14:33:59 -0800 Subject: [PATCH 07/56] Resolves #719: Support `.setReadVersion()` on `ReadTransaction` --- .../apple/foundationdb/FDBTransaction.java | 40 ++++++++++++ .../apple/foundationdb/ReadTransaction.java | 61 ++++++++++++++++++- .../com/apple/foundationdb/Transaction.java | 27 +------- documentation/sphinx/source/release-notes.rst | 1 + 4 files changed, 102 insertions(+), 27 deletions(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java index 49354b993d..b0e904f725 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java @@ -40,11 +40,26 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC public final ReadTransaction snapshot; class ReadSnapshot implements ReadTransaction { + @Override + public boolean isSnapshot() { + return true; + } + + @Override + public ReadTransaction snapshot() { + return this; + } + @Override public CompletableFuture getReadVersion() { return FDBTransaction.this.getReadVersion(); } + @Override + public void setReadVersion(long version) { + FDBTransaction.this.setReadVersion(version); + } + @Override public CompletableFuture get(byte[] key) { return get_internal(key, true); @@ -126,6 +141,16 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC return getRange(range, ReadTransaction.ROW_LIMIT_UNLIMITED); } + @Override + public void addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd) { + // Do nothing + } + + @Override + public void addReadConflictKeyIfNotSnapshot(byte[] key) { + // Do nothing + } + @Override public TransactionOptions options() { return FDBTransaction.this.options(); @@ -157,6 +182,11 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC transactionOwner = true; } + @Override + public boolean isSnapshot() { + return false; + } + @Override public ReadTransaction snapshot() { return snapshot; 
@@ -321,11 +351,21 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC } } + @Override + public void addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd) { + addReadConflictRange(keyBegin, keyEnd); + } + @Override public void addReadConflictRange(byte[] keyBegin, byte[] keyEnd) { addConflictRange(keyBegin, keyEnd, ConflictRangeType.READ); } + @Override + public void addReadConflictKeyIfNotSnapshot(byte[] key) { + addReadConflictKey(key); + } + @Override public void addReadConflictKey(byte[] key) { addConflictRange(key, ByteArrayUtil.join(key, new byte[]{(byte) 0}), ConflictRangeType.READ); diff --git a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java index 5130ce01ff..b7af660e9a 100644 --- a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java @@ -32,7 +32,7 @@ import com.apple.foundationdb.tuple.Tuple; *
* Note: Client must call {@link Transaction#commit()} and wait on the result on all transactions, * even ones that only read. This is done automatically when using the retry loops from - * {@link Database#run(Function)}. This is explained more in the intro to {@link Transaction}. + * {@link Database#run(java.util.function.Function)}. This is explained more in the intro to {@link Transaction}. * * @see Transaction */ @@ -43,12 +43,71 @@ public interface ReadTransaction extends ReadTransactionContext { */ int ROW_LIMIT_UNLIMITED = 0; + /** + * Gets whether this transaction is a snapshot view of the database. In other words, this returns + * whether read conflict ranges are omitted for any reads done through this {@code ReadTransaction}. + *
+ * For more information about how to use snapshot reads correctly, see + * Using snapshot reads. + * + * @return whether this is a snapshot view of the database with relaxed isolation properties + * @see #snapshot() + */ + boolean isSnapshot(); + + /** + * Return a special-purpose, read-only view of the database. Reads done through this interface are known as "snapshot reads". + * Snapshot reads selectively relax FoundationDB's isolation property, reducing + * Transaction conflicts + * but making reasoning about concurrency harder.
+ *
+ * For more information about how to use snapshot reads correctly, see + * Using snapshot reads. + * + * @return a read-only view of this {@code ReadTransaction} with relaxed isolation properties + */ + ReadTransaction snapshot(); + /** * Gets the version at which the reads for this {@code Transaction} will access the database. * @return the version for database reads */ CompletableFuture getReadVersion(); + /** + * Directly sets the version of the database at which to execute reads. The + * normal operation of a transaction is to determine an appropriately recent + * version; this call overrides that behavior. If the version is set too + * far in the past, {@code past_version} errors will be thrown from read operations. + * Infrequently used. + * + * @param version the version at which to read from the database + */ + void setReadVersion(long version); + + /** + * Adds the read conflict range that this {@code ReadTransaction} would have added as if it had read + * the given key range. If this is a {@linkplain #snapshot() snapshot} view of the database, this will + * not add the conflict range. This mirrors how reading a range through a snapshot view + * of the database does not add a conflict range for the read keys. + * + * @param keyBegin the first key in the range (inclusive) + * @param keyEnd the last key in the range (exclusive) + * @see Transaction#addReadConflictRange(byte[], byte[]) + */ + void addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd); + + /** + * Adds the read conflict range that this {@code ReadTransaction} would have added as if it had read + * the given key. If this is a {@linkplain #snapshot() snapshot} view of the database, this will + * not add the conflict range. This mirrors how reading a key through a snapshot view + * of the database does not add a conflict range for the read key. + * + * @param key the key to add to the read conflict range set (it this is not a snapshot view of the database) + * @see Transaction#addReadConflictKey(byte[]) + */ + void addReadConflictKeyIfNotSnapshot(byte[] key); + /** * Gets a value from the database. The call will return {@code null} if the key is not * present in the database. diff --git a/bindings/java/src/main/com/apple/foundationdb/Transaction.java b/bindings/java/src/main/com/apple/foundationdb/Transaction.java index 2b87736d7b..c3a8c6b671 100644 --- a/bindings/java/src/main/com/apple/foundationdb/Transaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/Transaction.java @@ -76,31 +76,6 @@ import com.apple.foundationdb.tuple.Tuple; */ public interface Transaction extends AutoCloseable, ReadTransaction, TransactionContext { - /** - * Return special-purpose, read-only view of the database. Reads done through this interface are known as "snapshot reads". - * Snapshot reads selectively relax FoundationDB's isolation property, reducing - * Transaction conflicts - * but making reasoning about concurrency harder.
- *
- * For more information about how to use snapshot reads correctly, see - * Using snapshot reads. - * - * @return a read-only view of this {@code Transaction} with relaxed isolation properties - */ - ReadTransaction snapshot(); - - /** - * Directly sets the version of the database at which to execute reads. The - * normal operation of a transaction is to determine an appropriately recent - * version; this call overrides that behavior. If the version is set too - * far in the past, {@code past_version} errors will be thrown from read operations. - * Infrequently used. - * - * @param version the version at which to read from the database - */ - void setReadVersion(long version); - - /** * Adds a range of keys to the transaction's read conflict ranges as if you * had read the range. As a result, other transactions that write a key in @@ -116,7 +91,7 @@ public interface Transaction extends AutoCloseable, ReadTransaction, Transaction * the key. As a result, other transactions that concurrently write this key * could cause the transaction to fail with a conflict. * - * @param key the key to be added to the range + * @param key the key to be added to the read conflict range set */ void addReadConflictKey(byte[] key); diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index fa83fc2875..a9d11f4d56 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -35,6 +35,7 @@ Bindings * Python: Removed ``fdb.init``, ``fdb.create_cluster``, and ``fdb.Cluster``. ``fdb.open`` no longer accepts a ``database_name`` parameter. `(PR #942) `_ * Java: Deprecated ``FDB.createCluster`` and ``Cluster``. The preferred way to get a ``Database`` is by using ``FDB.open``, which should work in both new and old API versions. `(PR #942) `_ * Java: Removed ``Cluster(long cPtr, Executor executor)`` constructor. This is API breaking for any code that has subclassed the ``Cluster`` class and is not protected by API versioning. `(PR #942) `_ +* Java: Several methods relevant to read-only transactions have been moved into the ``ReadTransaction`` interface. * Ruby: Removed ``FDB.init``, ``FDB.create_cluster``, and ``FDB.Cluster``. ``FDB.open`` no longer accepts a ``database_name`` parameter. `(PR #942) `_ * Golang: Deprecated ``fdb.StartNetwork``, ``fdb.Open``, ``fdb.MustOpen``, and ``fdb.CreateCluster`` and added ``fdb.OpenDatabase`` and ``fdb.MustOpenDatabase``. The preferred way to start the network and get a ``Database`` is by using ``FDB.OpenDatabase`` or ``FDB.OpenDefault``. `(PR #942) `_ * Flow: Deprecated ``API::createCluster`` and ``Cluster`` and added ``API::createDatabase``. The preferred way to get a ``Database`` is by using ``API::createDatabase``. 
`(PR #942) `_ From eb8a085cf9cf9588d2bc5ec80a912c0b58ba4cc3 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Sat, 2 Mar 2019 09:44:15 -0800 Subject: [PATCH 08/56] conditional add read conflict methods now return whether they added the conflict range ; test added for snapshot transactions --- .../apple/foundationdb/FDBTransaction.java | 16 +- .../apple/foundationdb/ReadTransaction.java | 6 +- .../test/SnapshotTransactionTest.java | 209 ++++++++++++++++++ 3 files changed, 223 insertions(+), 8 deletions(-) create mode 100644 bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java index b0e904f725..0e507c914d 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java @@ -142,13 +142,15 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC } @Override - public void addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd) { - // Do nothing + public boolean addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd) { + // This is a snapshot transaction; do not add the conflict range. + return false; } @Override - public void addReadConflictKeyIfNotSnapshot(byte[] key) { - // Do nothing + public boolean addReadConflictKeyIfNotSnapshot(byte[] key) { + // This is a snapshot transaction; do not add the conflict key. + return false; } @Override @@ -352,8 +354,9 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC } @Override - public void addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd) { + public boolean addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd) { addReadConflictRange(keyBegin, keyEnd); + return true; } @Override @@ -362,8 +365,9 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC } @Override - public void addReadConflictKeyIfNotSnapshot(byte[] key) { + public boolean addReadConflictKeyIfNotSnapshot(byte[] key) { addReadConflictKey(key); + return true; } @Override diff --git a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java index b7af660e9a..3f44e2ba36 100644 --- a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java @@ -93,9 +93,10 @@ public interface ReadTransaction extends ReadTransactionContext { * * @param keyBegin the first key in the range (inclusive) * @param keyEnd the last key in the range (exclusive) + * @return {@code true} if the read conflict range was added and {@code false} otherwise * @see Transaction#addReadConflictRange(byte[], byte[]) */ - void addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd); + boolean addReadConflictRangeIfNotSnapshot(byte[] keyBegin, byte[] keyEnd); /** * Adds the read conflict range that this {@code ReadTransaction} would have added as if it had read @@ -104,9 +105,10 @@ public interface ReadTransaction extends ReadTransactionContext { * of the database does not add a conflict range for the read key. 
* * @param key the key to add to the read conflict range set (it this is not a snapshot view of the database) + * @return {@code true} if the read conflict key was added and {@code false} otherwise * @see Transaction#addReadConflictKey(byte[]) */ - void addReadConflictKeyIfNotSnapshot(byte[] key); + boolean addReadConflictKeyIfNotSnapshot(byte[] key); /** * Gets a value from the database. The call will return {@code null} if the key is not diff --git a/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java b/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java new file mode 100644 index 0000000000..d7532f6bfe --- /dev/null +++ b/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java @@ -0,0 +1,209 @@ +/* + * SnapshotTransactionTest.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb.test; + +import java.util.UUID; +import java.util.concurrent.CompletionException; + +import com.apple.foundationdb.Database; +import com.apple.foundationdb.FDB; +import com.apple.foundationdb.FDBException; +import com.apple.foundationdb.ReadTransaction; +import com.apple.foundationdb.Transaction; +import com.apple.foundationdb.subspace.Subspace; +import com.apple.foundationdb.tuple.Tuple; + +/** + * Some tests regarding conflict ranges to make sure they do what we expect. + */ +public class SnapshotTransactionTest { + private static final int CONFLICT_CODE = 1020; + private static final Subspace SUBSPACE = new Subspace(Tuple.from("test", "conflict_ranges")); + + public static void main(String[] args) { + FDB fdb = FDB.selectAPIVersion(610); + try(Database db = fdb.open()) { + snapshotReadShouldNotConflict(db); + snapshotShouldNotAddConflictRange(db); + snapshotOnSnapshot(db); + } + } + + // Adding a random write conflict key makes it so the transaction conflicts are actually resolved. + public static void addUUIDConflicts(Transaction... 
trs) { + for(Transaction tr : trs) { + tr.options().setTimeout(1000); + tr.getReadVersion().join(); + byte[] key = SUBSPACE.pack(Tuple.from("uuids", UUID.randomUUID())); + tr.addReadConflictKey(key); + tr.addWriteConflictKey(key); + } + } + + public static void validateConflict(E e) throws E { + FDBException fdbE = null; + Throwable current = e; + while(current != null && fdbE == null) { + if(current instanceof FDBException) { + fdbE = (FDBException)current; + } + else { + current = current.getCause(); + } + } + if(fdbE == null) { + System.err.println("Error was not caused by FDBException"); + throw e; + } + else { + int errorCode = fdbE.getCode(); + if(errorCode != CONFLICT_CODE) { + System.err.println("FDB error was not caused by a transaction conflict"); + throw e; + } + } + } + + public static void snapshotReadShouldNotConflict(Database db) { + try(Transaction tr1 = db.createTransaction(); Transaction tr2 = db.createTransaction(); Transaction tr3 = db.createTransaction()) { + addUUIDConflicts(tr1, tr2, tr3); + + // Verify reading a *range* causes a conflict + tr1.addWriteConflictKey(SUBSPACE.pack(Tuple.from("foo", 0L))); + tr2.snapshot().getRange(SUBSPACE.range(Tuple.from("foo"))).asList().join(); + tr3.getRange(SUBSPACE.range(Tuple.from("foo"))).asList().join(); + + // Two successful commits + tr1.commit().join(); + tr2.commit().join(); + + // Read from tr3 should conflict with update from tr1. + try { + tr3.commit().join(); + throw new RuntimeException("tr3 did not conflict"); + } catch(CompletionException e) { + validateConflict(e); + } + } + try(Transaction tr1 = db.createTransaction(); Transaction tr2 = db.createTransaction(); Transaction tr3 = db.createTransaction()) { + addUUIDConflicts(tr1, tr2, tr3); + + // Verify reading a *key* causes a conflict + byte[] key = SUBSPACE.pack(Tuple.from("foo", 1066L)); + tr1.addWriteConflictKey(key); + tr2.snapshot().get(key); + tr3.get(key).join(); + + tr1.commit().join(); + tr2.commit().join(); + + try { + tr3.commit().join(); + throw new RuntimeException("tr3 did not conflict"); + } + catch(CompletionException e) { + validateConflict(e); + } + } + } + + public static void snapshotShouldNotAddConflictRange(Database db) { + try(Transaction tr1 = db.createTransaction(); Transaction tr2 = db.createTransaction(); Transaction tr3 = db.createTransaction()) { + addUUIDConflicts(tr1, tr2, tr3); + + // Verify adding a read conflict *range* causes a conflict. + Subspace fooSubspace = SUBSPACE.subspace(Tuple.from("foo")); + tr1.addWriteConflictKey(fooSubspace.pack(Tuple.from(0L))); + byte[] beginKey = fooSubspace.range().begin; + byte[] endKey = fooSubspace.range().end; + if(tr2.snapshot().addReadConflictRangeIfNotSnapshot(beginKey, endKey)) { + throw new RuntimeException("snapshot read said it added a conflict range"); + } + if(!tr3.addReadConflictRangeIfNotSnapshot(beginKey, endKey)) { + throw new RuntimeException("non-snapshot read said it did not add a conflict range"); + } + + // Two successful commits + tr1.commit().join(); + tr2.commit().join(); + + // Read from tr3 should conflict with update from tr1. + try { + tr3.commit().join(); + throw new RuntimeException("tr3 did not conflict"); + } + catch(CompletionException e) { + validateConflict(e); + } + } + try(Transaction tr1 = db.createTransaction(); Transaction tr2 = db.createTransaction(); Transaction tr3 = db.createTransaction()) { + addUUIDConflicts(tr1, tr2, tr3); + + // Verify adding a read conflict *key* causes a conflict. 
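+			// tr1 writes the key; tr2 asks its snapshot view to add the read
+			// conflict key (a no-op that returns false), while tr3 adds it for
+			// real, so only tr3's commit should fail with a conflict.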
+ byte[] key = SUBSPACE.pack(Tuple.from("foo", 1066L)); + tr1.addWriteConflictKey(key); + if(tr2.snapshot().addReadConflictKeyIfNotSnapshot(key)) { + throw new RuntimeException("snapshot read said it added a conflict range"); + } + if(!tr3.addReadConflictKeyIfNotSnapshot(key)) { + throw new RuntimeException("non-snapshot read said it did not add a conflict range"); + } + + // Two successful commits + tr1.commit().join(); + tr2.commit().join(); + + // Read from tr3 should conflict with update from tr1. + try { + tr3.commit().join(); + throw new RuntimeException("tr3 did not conflict"); + } + catch(CompletionException e) { + validateConflict(e); + } + } + } + + private static void snapshotOnSnapshot(Database db) { + try(Transaction tr = db.createTransaction()) { + if(tr.isSnapshot()) { + throw new RuntimeException("new transaction is a snapshot transaction"); + } + ReadTransaction snapshotTr = tr.snapshot(); + if(!snapshotTr.isSnapshot()) { + throw new RuntimeException("snapshot transaction is not a snapshot transaction"); + } + if(snapshotTr == tr) { + throw new RuntimeException("snapshot and regular transaction are pointer-equal"); + } + ReadTransaction snapshotSnapshotTr = snapshotTr.snapshot(); + if(!snapshotSnapshotTr.isSnapshot()) { + throw new RuntimeException("snapshot transaction is not a snapshot transaction"); + } + if(snapshotSnapshotTr != snapshotTr) { + throw new RuntimeException("calling snapshot on a snapshot transaction produced a different transaction"); + } + } + } + + private SnapshotTransactionTest() {} +} + From c1de93f467f48a76e0aee3b4c10bf6a59a14df9a Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sat, 2 Mar 2019 13:55:41 -0800 Subject: [PATCH 09/56] fix: binary search could get stuck in an infinite loop fix: avoid picking a read version which could be before the last real commit fix: we must wait on metadataVersionKey in case it is not already cached fixed review comments --- fdbclient/DatabaseContext.h | 2 +- fdbclient/NativeAPI.actor.cpp | 18 +++++++++--------- fdbclient/ReadYourWrites.actor.cpp | 2 +- fdbserver/workloads/VersionStamp.actor.cpp | 20 ++++++++++++++++---- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index c0a0bbbcbc..7313032b4c 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -160,7 +160,7 @@ public: int apiVersion; - int mvcInsertLocation; + int mvCacheInsertLocation; std::vector>> metadataVersionCache; }; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 3fe1032d78..4216902812 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -474,7 +474,7 @@ DatabaseContext::DatabaseContext( transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0), transactionsMaybeCommitted(0), transactionsResourceConstrained(0), outstandingWatches(0), - latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvcInsertLocation(0) + latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0) { metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = 
CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; @@ -1943,15 +1943,15 @@ Future> Transaction::get( const Key& key, bool snapshot ) { return metadataVersion.getFuture(); } else { if(ver.isError()) return ver.getError(); - if(ver.get() == cx->metadataVersionCache[cx->mvcInsertLocation].first) { - return cx->metadataVersionCache[cx->mvcInsertLocation].second; + if(ver.get() == cx->metadataVersionCache[cx->mvCacheInsertLocation].first) { + return cx->metadataVersionCache[cx->mvCacheInsertLocation].second; } Version v = ver.get(); - int hi = cx->mvcInsertLocation; - int lo = (cx->mvcInsertLocation+1)%cx->metadataVersionCache.size(); + int hi = cx->mvCacheInsertLocation; + int lo = (cx->mvCacheInsertLocation+1)%cx->metadataVersionCache.size(); - while(true) { + while(hi!=lo) { int cu = hi > lo ? (hi + lo)/2 : ((hi + cx->metadataVersionCache.size() + lo)/2)%cx->metadataVersionCache.size(); if(v == cx->metadataVersionCache[cu].first) { return cx->metadataVersionCache[cu].second; @@ -2872,9 +2872,9 @@ ACTOR Future extractReadVersion(DatabaseContext* cx, Reference cx->metadataVersionCache[cx->mvcInsertLocation].first) { - cx->mvcInsertLocation = (cx->mvcInsertLocation + 1)%cx->metadataVersionCache.size(); - cx->metadataVersionCache[cx->mvcInsertLocation] = std::make_pair(rep.version, rep.metadataVersion); + if(rep.version > cx->metadataVersionCache[cx->mvCacheInsertLocation].first) { + cx->mvCacheInsertLocation = (cx->mvCacheInsertLocation + 1)%cx->metadataVersionCache.size(); + cx->metadataVersionCache[cx->mvCacheInsertLocation] = std::make_pair(rep.version, rep.metadataVersion); } metadataVersion.send(rep.metadataVersion); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 4cfbf8e8bd..3cc4534118 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1500,7 +1500,7 @@ void ReadYourWritesTransaction::atomicOp( const KeyRef& key, const ValueRef& ope } if (key == metadataVersionKey) { - if(!tr.apiVersionAtLeast(610)) { + if(!tr.apiVersionAtLeast(610) && key >= getMaxWriteKey()) { throw key_outside_legal_range(); } diff --git a/fdbserver/workloads/VersionStamp.actor.cpp b/fdbserver/workloads/VersionStamp.actor.cpp index 75460089a1..88cf8bfa6f 100644 --- a/fdbserver/workloads/VersionStamp.actor.cpp +++ b/fdbserver/workloads/VersionStamp.actor.cpp @@ -165,10 +165,19 @@ struct VersionStampWorkload : TestWorkload { state Version readVersion = wait(tr.getReadVersion()); if(BUGGIFY) { - tr.reset(); if(g_random->random01() < 0.5) { - readVersion--; + loop { + try { + tr.makeSelfConflicting(); + wait(tr.commit()); + readVersion = tr.getCommittedVersion() - 1; + break; + } catch( Error &e ) { + wait( tr.onError(e) ); + } + } } + tr.reset(); tr.setVersion(readVersion); } @@ -177,8 +186,11 @@ struct VersionStampWorkload : TestWorkload { try { Standalone result_ = wait(tr.getRange(KeyRangeRef(self->vsValuePrefix, endOfRange(self->vsValuePrefix)), self->nodeCount + 1)); result = result_; - if((self->apiVersion >= 610 || self->apiVersion == Database::API_VERSION_LATEST) && tr.get(metadataVersionKey).get().present() && self->key_commit.count(metadataVersionKey)) { - result.push_back_deep(result.arena(), KeyValueRef(metadataVersionKey,tr.get(metadataVersionKey).get().get())); + if((self->apiVersion >= 610 || self->apiVersion == Database::API_VERSION_LATEST) && self->key_commit.count(metadataVersionKey)) { + Optional mVal = wait(tr.get(metadataVersionKey)); + if(mVal.present()) { + result.push_back_deep(result.arena(), 
KeyValueRef(metadataVersionKey,mVal.get())); + } } ASSERT(result.size() <= self->nodeCount); if (self->failIfDataLost) { From 057ebe56e4c120856e8397771c6f179b65465bad Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sun, 3 Mar 2019 21:31:40 -0800 Subject: [PATCH 10/56] fix: unknownCommit handling relied on soleOwnership of the version stamp keys, so we need to use a second key to track the commit version for the metadataVersionKey renamed a confusing option --- fdbserver/workloads/VersionStamp.actor.cpp | 13 +++++++++---- tests/fast/VersionStamp.txt | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/fdbserver/workloads/VersionStamp.actor.cpp b/fdbserver/workloads/VersionStamp.actor.cpp index 88cf8bfa6f..2af428a1d0 100644 --- a/fdbserver/workloads/VersionStamp.actor.cpp +++ b/fdbserver/workloads/VersionStamp.actor.cpp @@ -40,7 +40,7 @@ struct VersionStampWorkload : TestWorkload { std::map>>> key_commit; std::map>>> versionStampKey_commit; int apiVersion; - bool onlyTestChangingMetadataVersion; + bool soleOwnerOfMetadataVersionKey; VersionStampWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) @@ -54,7 +54,7 @@ struct VersionStampWorkload : TestWorkload { vsKeyPrefix = LiteralStringRef("K_").withPrefix(prefix); vsValuePrefix = LiteralStringRef("V_").withPrefix(prefix); validateExtraDB = getOption(options, LiteralStringRef("validateExtraDB"), false); - onlyTestChangingMetadataVersion = getOption(options, LiteralStringRef("onlyTestChangingMetadataVersion"), false); + soleOwnerOfMetadataVersionKey = getOption(options, LiteralStringRef("soleOwnerOfMetadataVersionKey"), false); } virtual std::string description() { return "VersionStamp"; } @@ -212,7 +212,7 @@ struct VersionStampWorkload : TestWorkload { ASSERT(all_values_iter != self->key_commit.end()); // Reading a key that we didn't commit. const auto& all_values = all_values_iter->second; - if(it.key == metadataVersionKey && !self->onlyTestChangingMetadataVersion) { + if(it.key == metadataVersionKey && !self->soleOwnerOfMetadataVersionKey) { if(self->failIfDataLost) { for(auto& it : all_values) { ASSERT(it.first <= parsedVersion); @@ -325,8 +325,13 @@ struct VersionStampWorkload : TestWorkload { state bool error = false; state Error err; //TraceEvent("VST_CommitBegin").detail("Key", printable(key)).detail("VsKey", printable(versionStampKey)).detail("Clear", printable(range)); + state Key testKey; try { tr.atomicOp(key, versionStampValue, MutationRef::SetVersionstampedValue); + if(key == metadataVersionKey) { + testKey = "testKey" + g_random->randomUniqueID().toString(); + tr.atomicOp(testKey, versionStampValue, MutationRef::SetVersionstampedValue); + } tr.clear(range); tr.atomicOp(versionStampKey, value, MutationRef::SetVersionstampedKey); state Future> fTrVs = tr.getVersionstamp(); @@ -349,7 +354,7 @@ struct VersionStampWorkload : TestWorkload { state ReadYourWritesTransaction cur_tr(cx_is_primary ? cx : extraDB); cur_tr.setOption(FDBTransactionOptions::LOCK_AWARE); try { - Optional vs_value = wait(cur_tr.get(key)); + Optional vs_value = wait(cur_tr.get(key == metadataVersionKey ? 
testKey : key)); if (!vs_value.present()) { error = true; break; diff --git a/tests/fast/VersionStamp.txt b/tests/fast/VersionStamp.txt index 2925a39e0e..9c06375805 100644 --- a/tests/fast/VersionStamp.txt +++ b/tests/fast/VersionStamp.txt @@ -1,3 +1,3 @@ testTitle=VersionStamp testName=VersionStamp -onlyTestChangingMetadataVersion=true +soleOwnerOfMetadataVersionKey=true From 5231ee6da0837dd85140961d78e05d94cd5f6383 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sun, 3 Mar 2019 21:34:21 -0800 Subject: [PATCH 11/56] make sure the version of a cluster is larger than the restore version, so future version stamps are larger than restored version stamps --- fdbclient/FileBackupAgent.actor.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index ab6604e42b..6efef2b5d2 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -3275,6 +3275,25 @@ namespace fileBackup { } } + tr->reset(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + Version destVersion = wait(tr->getReadVersion()); + TraceEvent("FileRestoreVersionUpgrade").detail("RestoreVersion", restoreVersion).detail("Dest", destVersion); + if (destVersion <= restoreVersion) { + TEST(true); // Forcing restored cluster to higher version + tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(restoreVersion+1, Unversioned())); + wait(tr->commit()); + } else { + break; + } + } catch( Error &e ) { + wait(tr->onError(e)); + } + } + Optional restorable = wait(bc->getRestoreSet(restoreVersion)); if(!restorable.present()) From 988add9fb5159bcb8163f7e117b20f8f0207a141 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 4 Mar 2019 16:48:34 -0800 Subject: [PATCH 12/56] cache the metadataVersion for commits, so that doing setVersion with the commit version of a different transaction will --- fdbclient/MasterProxyInterface.h | 5 +++-- fdbclient/NativeAPI.actor.cpp | 4 ++++ fdbserver/MasterProxyServer.actor.cpp | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fdbclient/MasterProxyInterface.h b/fdbclient/MasterProxyInterface.h index e549aa5be1..8640d71ce3 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -71,14 +71,15 @@ struct MasterProxyInterface { struct CommitID { Version version; // returns invalidVersion if transaction conflicts uint16_t txnBatchId; + Optional metadataVersion; template void serialize(Ar& ar) { - serializer(ar, version, txnBatchId); + serializer(ar, version, txnBatchId, metadataVersion); } CommitID() : version(invalidVersion), txnBatchId(0) {} - CommitID( Version version, uint16_t txnBatchId ) : version(version), txnBatchId(txnBatchId) {} + CommitID( Version version, uint16_t txnBatchId, const Optional& metadataVersion ) : version(version), txnBatchId(txnBatchId), metadataVersion(metadataVersion) {} }; struct CommitTransactionRequest : TimedRequest { diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index c4a0c1f17c..115a9264fd 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -2528,6 +2528,10 @@ ACTOR static Future tryCommit( Database cx, Reference if (info.debugID.present()) TraceEvent(interval.end()).detail("CommittedVersion", v); *pCommittedVersion = v; + if(v > cx->metadataVersionCache[cx->mvCacheInsertLocation].first) { + cx->mvCacheInsertLocation = (cx->mvCacheInsertLocation + 
1)%cx->metadataVersionCache.size(); + cx->metadataVersionCache[cx->mvCacheInsertLocation] = std::make_pair(v, ci.metadataVersion); + } Standalone ret = makeString(10); placeVersionstamp(mutateString(ret), v, ci.txnBatchId); diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 1b61a478b9..fab3f7c99e 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -980,7 +980,7 @@ ACTOR Future commitBatch( for (int t = 0; t < trs.size(); t++) { if (committed[t] == ConflictBatch::TransactionCommitted && (!locked || trs[t].isLockAware())) { ASSERT_WE_THINK(commitVersion != invalidVersion); - trs[t].reply.send(CommitID(commitVersion, t)); + trs[t].reply.send(CommitID(commitVersion, t, metadataVersionAfter)); } else if (committed[t] == ConflictBatch::TransactionTooOld) { trs[t].reply.sendError(transaction_too_old()); From 3d196c9e97a0f8e9d726ea477bc2c312baaa070f Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 4 Mar 2019 20:56:31 -0800 Subject: [PATCH 13/56] fix: metadataVersionKey log range removal needs to be checked for each logDestination --- fdbserver/ApplyMetadataMutation.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fdbserver/ApplyMetadataMutation.h b/fdbserver/ApplyMetadataMutation.h index d7f3ebc42b..0788956323 100644 --- a/fdbserver/ApplyMetadataMutation.h +++ b/fdbserver/ApplyMetadataMutation.h @@ -421,20 +421,20 @@ static void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefintersectingRanges(normalKeys)) { - if(it.value().count(logDestination) > 0) { - foundKey = true; - break; + bool foundKey = false; + for(auto &it : vecBackupKeys->intersectingRanges(normalKeys)) { + if(it.value().count(logDestination) > 0) { + foundKey = true; + break; + } } - } - if(!foundKey) { - auto logRanges = vecBackupKeys->modify(singleKeyRange(metadataVersionKey)); - for (auto logRange : logRanges) { - auto &logRangeMap = logRange->value(); - logRangeMap.erase(logDestination); + if(!foundKey) { + auto logRanges = vecBackupKeys->modify(singleKeyRange(metadataVersionKey)); + for (auto logRange : logRanges) { + auto &logRangeMap = logRange->value(); + logRangeMap.erase(logDestination); + } } } From d3377722d5a493742d32629c2652ea28a0e9c27d Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Mar 2019 04:00:11 -0800 Subject: [PATCH 14/56] Added blob store Backup URL parameter 'header' which enables addition of custom HTTP header fields to blob store HTTP requests. Added 'fdbbackup modify' command line tool for changing the backup URL and parameters, default snapshot interval, and/or current snapshot interval of a running backup. 
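For illustration, a modify invocation against a blob store destination might look like the following (tag, credentials, and URL are hypothetical values assembled from the documentation examples, not output of a real run):

    user@host$ fdbbackup modify -t default -s 864000 -d 'blobstore://myKey:mySecret@something.domain.com:80/dec_1_2017_0400?bucket=backups&header=x-amz-storage-class:REDUCED_REDUNDANCY'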
--- documentation/sphinx/source/backups.rst | 8 +- fdbbackup/backup.actor.cpp | 189 ++++++++++++++++++++---- fdbclient/BackupAgent.h | 5 - fdbclient/BackupContainer.actor.cpp | 1 + fdbclient/BlobStore.actor.cpp | 52 ++++++- fdbclient/BlobStore.h | 5 +- fdbclient/FileBackupAgent.actor.cpp | 4 +- fdbclient/HTTP.actor.cpp | 2 +- fdbclient/Knobs.cpp | 2 +- flow/Arena.h | 16 ++ 10 files changed, 239 insertions(+), 45 deletions(-) diff --git a/documentation/sphinx/source/backups.rst b/documentation/sphinx/source/backups.rst index 009dad0da5..339ee72029 100644 --- a/documentation/sphinx/source/backups.rst +++ b/documentation/sphinx/source/backups.rst @@ -100,7 +100,7 @@ If is not specified, it will be looked up in :ref:`blob credential sour An example blob store Backup URL would be ``blobstore://myKey:mySecret@something.domain.com:80/dec_1_2017_0400?bucket=backups``. -Blob store Backup URLs can have optional parameters at the end which set various limits on interactions with the blob store. All values must be positive decimal integers. The default values are not very restrictive. The most likely parameter a user would want to change is ``max_send_bytes_per_second`` (or ``sbps`` for short) which determines the upload speed to the blob service. +Blob store Backup URLs can have optional parameters at the end which set various limits or options used when communicating with the store. All values must be positive decimal integers unless otherwise specified. The speed related default values are not very restrictive. The most likely parameter a user would want to change is ``max_send_bytes_per_second`` (or ``sbps`` for short) which determines the upload speed to the blob service. Here is a complete list of valid parameters: @@ -130,7 +130,11 @@ Here is a complete list of valid parameters: *max_send_bytes_per_second* (or *sbps*) - Max send bytes per second for all requests combined. - *max_recv_bytes_per_second* (or *rbps*) - Max receive bytes per second for all requests combined + *max_recv_bytes_per_second* (or *rbps*) - Max receive bytes per second for all requests combined. + + *header* - Add an additional HTTP header to each blob store REST API request. Can be specified multiple times. Format is *header=:* where both strings are non-empty. + + **Example**: The URL parameter *header=x-amz-storage-class:REDUCED_REDUNDANCY* would send the HTTP header required to use the reduced redundancy storage option in the S3 API. .. 
_blob-credential-files: diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 7bef1a33a5..f704f44124 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -76,7 +76,7 @@ enum enumProgramExe { }; enum enumBackupType { - BACKUP_UNDEFINED=0, BACKUP_START, BACKUP_STATUS, BACKUP_ABORT, BACKUP_WAIT, BACKUP_DISCONTINUE, BACKUP_PAUSE, BACKUP_RESUME, BACKUP_EXPIRE, BACKUP_DELETE, BACKUP_DESCRIBE, BACKUP_LIST, BACKUP_DUMP + BACKUP_UNDEFINED=0, BACKUP_START, BACKUP_MODIFY, BACKUP_STATUS, BACKUP_ABORT, BACKUP_WAIT, BACKUP_DISCONTINUE, BACKUP_PAUSE, BACKUP_RESUME, BACKUP_EXPIRE, BACKUP_DELETE, BACKUP_DESCRIBE, BACKUP_LIST, BACKUP_DUMP }; enum enumDBType { @@ -99,6 +99,9 @@ enum { // Backup and Restore constants OPT_TAGNAME, OPT_BACKUPKEYS, OPT_WAITFORDONE, + // Backup Modify + OPT_MOD_ACTIVE_INTERVAL, OPT_MOD_VERIFY_UID, + // Restore constants OPT_RESTORECONTAINER, OPT_DBVERSION, OPT_PREFIX_ADD, OPT_PREFIX_REMOVE, @@ -181,6 +184,41 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = { SO_END_OF_OPTIONS }; +CSimpleOpt::SOption g_rgBackupModifyOptions[] = { +#ifdef _WIN32 + { OPT_PARENTPID, "--parentpid", SO_REQ_SEP }, +#endif + { OPT_TRACE, "--log", SO_NONE }, + { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, + { OPT_QUIET, "-q", SO_NONE }, + { OPT_QUIET, "--quiet", SO_NONE }, + { OPT_VERSION, "-v", SO_NONE }, + { OPT_VERSION, "--version", SO_NONE }, + { OPT_CRASHONERROR, "--crash", SO_NONE }, + { OPT_MEMLIMIT, "-m", SO_REQ_SEP }, + { OPT_MEMLIMIT, "--memory", SO_REQ_SEP }, + { OPT_HELP, "-?", SO_NONE }, + { OPT_HELP, "-h", SO_NONE }, + { OPT_HELP, "--help", SO_NONE }, + { OPT_DEVHELP, "--dev-help", SO_NONE }, + { OPT_BLOB_CREDENTIALS, "--blob_credentials", SO_REQ_SEP }, + { OPT_KNOB, "--knob_", SO_REQ_SEP }, + { OPT_CLUSTERFILE, "-C", SO_REQ_SEP }, + { OPT_CLUSTERFILE, "--cluster_file", SO_REQ_SEP }, + { OPT_TAGNAME, "-t", SO_REQ_SEP }, + { OPT_TAGNAME, "--tagname", SO_REQ_SEP }, + { OPT_MOD_VERIFY_UID, "--verify_uid", SO_REQ_SEP }, + { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, + { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_SNAPSHOTINTERVAL, "-s", SO_REQ_SEP }, + { OPT_SNAPSHOTINTERVAL, "--snapshot_interval", SO_REQ_SEP }, + { OPT_MOD_ACTIVE_INTERVAL, "-as", SO_REQ_SEP }, + { OPT_MOD_ACTIVE_INTERVAL, "--active_snapshot_interval", SO_REQ_SEP }, + + SO_END_OF_OPTIONS +}; + CSimpleOpt::SOption g_rgBackupStatusOptions[] = { #ifdef _WIN32 { OPT_PARENTPID, "--parentpid", SO_REQ_SEP }, @@ -1027,6 +1065,7 @@ enumBackupType getBackupType(std::string backupType) values["describe"] = BACKUP_DESCRIBE; values["list"] = BACKUP_LIST; values["dump"] = BACKUP_DUMP; + values["modify"] = BACKUP_MODIFY; } auto i = values.find(backupType); @@ -1787,6 +1826,31 @@ ACTOR Future changeDBBackupResumed(Database src, Database dest, bool pause return Void(); } +Reference openBackupContainer(const char *name, std::string destinationContainer) { + // Error, if no dest container was specified + if (destinationContainer.empty()) { + fprintf(stderr, "ERROR: No backup destination was specified.\n"); + printHelpTeaser(name); + throw backup_error(); + } + + Reference c; + try { + c = IBackupContainer::openContainer(destinationContainer); + } + catch (Error& e) { + std::string msg = format("ERROR: '%s' on URL '%s'", e.what(), destinationContainer.c_str()); + if(e.code() == error_code_backup_invalid_url && !IBackupContainer::lastOpenError.empty()) { + msg += format(": %s", IBackupContainer::lastOpenError.c_str()); + } + fprintf(stderr, 
"%s\n", msg.c_str()); + printHelpTeaser(name); + throw; + } + + return c; +} + ACTOR Future runRestore(Database db, std::string tagName, std::string container, Standalone> ranges, Version targetVersion, bool performRestore, bool verbose, bool waitForDone, std::string addPrefix, std::string removePrefix) { try { @@ -1799,7 +1863,7 @@ ACTOR Future runRestore(Database db, std::string tagName, std::string cont state KeyRange range = (ranges.size() == 0) ? normalKeys : ranges.front(); - state Reference bc = IBackupContainer::openContainer(container); + state Reference bc = openBackupContainer(exeRestore.toString().c_str(), container); // If targetVersion is unset then use the maximum restorable version from the backup description if(targetVersion == invalidVersion) { @@ -1849,30 +1913,6 @@ ACTOR Future runRestore(Database db, std::string tagName, std::string cont return Void(); } -Reference openBackupContainer(const char *name, std::string destinationContainer) { - // Error, if no dest container was specified - if (destinationContainer.empty()) { - fprintf(stderr, "ERROR: No backup destination was specified.\n"); - printHelpTeaser(name); - throw backup_error(); - } - - std::string error; - Reference c; - try { - c = IBackupContainer::openContainer(destinationContainer); - } - catch (Error& e) { - if(!error.empty()) - error = std::string("[") + error + "]"; - fprintf(stderr, "ERROR (%s) on %s %s\n", e.what(), destinationContainer.c_str(), error.c_str()); - printHelpTeaser(name); - throw; - } - - return c; -} - ACTOR Future dumpBackupData(const char *name, std::string destinationContainer, Version beginVersion, Version endVersion) { state Reference c = openBackupContainer(name, destinationContainer); @@ -2023,13 +2063,76 @@ ACTOR Future listBackup(std::string baseUrl) { } } catch (Error& e) { - fprintf(stderr, "ERROR: %s\n", e.what()); + std::string msg = format("ERROR: %s", e.what()); + if(e.code() == error_code_backup_invalid_url && !IBackupContainer::lastOpenError.empty()) { + msg += format(": %s", IBackupContainer::lastOpenError.c_str()); + } + fprintf(stderr, "%s\n", msg.c_str()); throw; } return Void(); } +struct BackupModifyOptions { + Optional verifyUID; + Optional destURL; + Optional snapshotInterval; + Optional activeSnapshotInterval; +}; + +ACTOR Future modifyBackup(Database db, std::string tagName, BackupModifyOptions options) { + state KeyBackedTag tag = makeBackupTag(tagName); + + state Reference tr(new ReadYourWritesTransaction()); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state Optional uidFlag = wait(tag.get(db)); + if(uidFlag.get().second) { + fprintf(stderr, "Cannot modify aborted backup on tag '%s'\n", tagName.c_str()); + throw backup_error(); + } + + state BackupConfig config(uidFlag.get().first); + EBackupState s = wait(config.stateEnum().getOrThrow(tr, false, backup_invalid_info())); + if(!FileBackupAgent::isRunnable(s)) { + fprintf(stderr, "Backup on tag '%s' is not runnable.\n", tagName.c_str()); + throw backup_error(); + } + + if(options.verifyUID.present() && options.verifyUID.get() != uidFlag.get().first.toString()) { + fprintf(stderr, "UID verification failed, backup on tag '%s' is '%s' but '%s' was specified.\n", tagName.c_str(), uidFlag.get().first.toString().c_str(), options.verifyUID.get().c_str()); + throw backup_error(); + } + + if(options.snapshotInterval.present()) { + config.snapshotIntervalSeconds().set(tr, options.snapshotInterval.get()); + } + + 
if(options.activeSnapshotInterval.present()) { + Version begin = wait(config.snapshotBeginVersion().getOrThrow(tr, false, backup_error())); + config.snapshotTargetEndVersion().set(tr, begin); + } + + if(options.destURL.present()) { + Reference bc = openBackupContainer(exeBackup.toString().c_str(), options.destURL.get()); + config.backupContainer().set(tr, bc); + } + + Void _ = wait(tr->commit()); + break; + } + catch (Error& e) { + Void _ = wait(tr->onError(e)); + } + } + + return Void(); +} + static std::vector> parseLine(std::string &line, bool& err, bool& partial) { err = false; @@ -2287,6 +2390,9 @@ int main(int argc, char* argv[]) { case BACKUP_LIST: args = new CSimpleOpt(argc - 1, &argv[1], g_rgBackupListOptions, SO_O_EXACT); break; + case BACKUP_MODIFY: + args = new CSimpleOpt(argc - 1, &argv[1], g_rgBackupModifyOptions, SO_O_EXACT); + break; case BACKUP_UNDEFINED: default: // Display help, if requested @@ -2424,6 +2530,8 @@ int main(int argc, char* argv[]) { Version dumpBegin = 0; Version dumpEnd = std::numeric_limits::max(); + BackupModifyOptions modifyOptions; + if( argc == 1 ) { printUsage(programExe, false); return FDB_EXIT_ERROR; @@ -2593,16 +2701,30 @@ int main(int argc, char* argv[]) { // If the url starts with '/' then prepend "file://" for backwards compatibility if(StringRef(destinationContainer).startsWith(LiteralStringRef("/"))) destinationContainer = std::string("file://") + destinationContainer; + modifyOptions.destURL = destinationContainer; break; - case OPT_SNAPSHOTINTERVAL: { + case OPT_SNAPSHOTINTERVAL: + case OPT_MOD_ACTIVE_INTERVAL: + { const char* a = args->OptionArg(); - if (!sscanf(a, "%d", &snapshotIntervalSeconds)) { + int seconds; + if (!sscanf(a, "%d", &seconds)) { fprintf(stderr, "ERROR: Could not parse snapshot interval `%s'\n", a); printHelpTeaser(argv[0]); return FDB_EXIT_ERROR; } + if(optId == OPT_SNAPSHOTINTERVAL) { + snapshotIntervalSeconds = seconds; + modifyOptions.snapshotInterval = seconds; + } + else if(optId == OPT_MOD_ACTIVE_INTERVAL) { + modifyOptions.activeSnapshotInterval = seconds; + } break; } + case OPT_MOD_VERIFY_UID: + modifyOptions.verifyUID = args->OptionArg(); + break; case OPT_WAITFORDONE: waitForDone = true; break; @@ -2947,6 +3069,15 @@ int main(int argc, char* argv[]) { break; } + case BACKUP_MODIFY: + { + if(!initCluster()) + return FDB_EXIT_ERROR; + + f = stopAfter( modifyBackup(db, tagName, modifyOptions) ); + break; + } + case BACKUP_STATUS: if(!initCluster()) return FDB_EXIT_ERROR; diff --git a/fdbclient/BackupAgent.h b/fdbclient/BackupAgent.h index e0dbe219f3..4868b57a5b 100644 --- a/fdbclient/BackupAgent.h +++ b/fdbclient/BackupAgent.h @@ -702,11 +702,6 @@ public: return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - // Get the backup container URL only without creating a backup container instance. 
- KeyBackedProperty> backupContainerURL() { - return configSpace.pack(LiteralStringRef("backupContainer")); - } - // Stop differntial logging if already started or don't start after completing KV ranges KeyBackedProperty stopWhenDone() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 6a13f7ef7c..95af3f422f 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -1329,6 +1329,7 @@ public: continue; } TraceEvent(SevWarn, "BackupContainerBlobStoreInvalidParameter").detail("Name", printable(kv.first)).detail("Value", printable(kv.second)); + IBackupContainer::lastOpenError = format("Unknown URL parameter: '%s'", kv.first.c_str()); throw backup_invalid_url(); } } diff --git a/fdbclient/BlobStore.actor.cpp b/fdbclient/BlobStore.actor.cpp index d222491695..1bcaad3737 100644 --- a/fdbclient/BlobStore.actor.cpp +++ b/fdbclient/BlobStore.actor.cpp @@ -146,10 +146,13 @@ Reference BlobStoreEndpoint::fromString(std::string const &ur StringRef t(url); StringRef prefix = t.eat("://"); if(prefix != LiteralStringRef("blobstore")) - throw std::string("Invalid blobstore URL."); + throw format("Invalid blobstore URL prefix '%s'", prefix.toString().c_str()); StringRef cred = t.eat("@"); - StringRef hostPort = t.eat("/"); - StringRef resource = t.eat("?"); + StringRef hostPort = t.eatAny("?/"); + StringRef resource; + if(t[-1] == '/') { + resource = t.eat("?"); + } // hostPort is at least a host or IP address, optionally followed by :portNumber or :serviceName StringRef h(hostPort); @@ -160,12 +163,31 @@ Reference BlobStoreEndpoint::fromString(std::string const &ur StringRef service = h.eat(); BlobKnobs knobs; + HTTP::Headers extraHeaders; while(1) { StringRef name = t.eat("="); if(name.size() == 0) break; StringRef value = t.eat("&"); + // Special case for header + if(name == LiteralStringRef("header")) { + StringRef originalValue = value; + StringRef headerFieldName = value.eat(":"); + StringRef headerFieldValue = value; + if(headerFieldName.size() == 0 || headerFieldValue.size() == 0) { + throw format("'%s' is not a valid value for '%s' parameter. Format is : where strings are not empty.", originalValue.toString().c_str(), name.toString().c_str()); + } + std::string &fieldValue = extraHeaders[headerFieldName.toString()]; + // RFC 2616 section 4.2 says header field names can repeat but only if it is valid to concatenate their values with comma separation + if(!fieldValue.empty()) { + fieldValue.append(","); + } + fieldValue.append(headerFieldValue.toString()); + continue; + } + + // See if the parameter is a knob // First try setting a dummy value (all knobs are currently numeric) just to see if this parameter is known to BlobStoreEndpoint. // If it is, then we will set it to a good value or throw below, so the dummy set has no bad side effects. 
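// For example, a documented numeric parameter such as max_send_bytes_per_second (sbps) reaches this point, // while the special-cased 'header' parameter was already consumed above before knob handling.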
bool known = knobs.set(name, 0); @@ -196,7 +218,7 @@ Reference BlobStoreEndpoint::fromString(std::string const &ur StringRef key = c.eat(":"); StringRef secret = c.eat(); - return Reference(new BlobStoreEndpoint(host.toString(), service.toString(), key.toString(), secret.toString(), knobs)); + return Reference(new BlobStoreEndpoint(host.toString(), service.toString(), key.toString(), secret.toString(), knobs, extraHeaders)); } catch(std::string &err) { if(error != nullptr) @@ -220,8 +242,20 @@ std::string BlobStoreEndpoint::getResourceURL(std::string resource) { std::string r = format("blobstore://%s%s@%s/%s", key.c_str(), s.c_str(), hostPort.c_str(), resource.c_str()); std::string p = knobs.getURLParameters(); + + for(auto &kv : extraHeaders) { + if(!p.empty()) { + p.append("&"); + } + p.append("header="); + p.append(HTTP::urlEncode(kv.first)); + p.append(":"); + p.append(HTTP::urlEncode(kv.second)); + } + if(!p.empty()) r.append("?").append(p); + return r; } @@ -494,6 +528,16 @@ ACTOR Future> doRequest_impl(Referencehost; headers["Accept"] = "application/xml"; + + // Merge extraHeaders into headers + for(auto &kv : bstore->extraHeaders) { + std::string &fieldValue = headers[kv.first]; + if(!fieldValue.empty()) { + fieldValue.append(","); + } + fieldValue.append(kv.second); + } + Void _ = wait(bstore->concurrentRequests.take()); state FlowLock::Releaser globalReleaser(bstore->concurrentRequests, 1); diff --git a/fdbclient/BlobStore.h b/fdbclient/BlobStore.h index a2d3e4d33c..5b88c8f023 100644 --- a/fdbclient/BlobStore.h +++ b/fdbclient/BlobStore.h @@ -102,8 +102,8 @@ public: } }; - BlobStoreEndpoint(std::string const &host, std::string service, std::string const &key, std::string const &secret, BlobKnobs const &knobs = BlobKnobs()) - : host(host), service(service), key(key), secret(secret), lookupSecret(secret.empty()), knobs(knobs), + BlobStoreEndpoint(std::string const &host, std::string service, std::string const &key, std::string const &secret, BlobKnobs const &knobs = BlobKnobs(), HTTP::Headers extraHeaders = HTTP::Headers()) + : host(host), service(service), key(key), secret(secret), lookupSecret(secret.empty()), knobs(knobs), extraHeaders(extraHeaders), requestRate(new SpeedLimit(knobs.requests_per_second, 1)), requestRateList(new SpeedLimit(knobs.list_requests_per_second, 1)), requestRateWrite(new SpeedLimit(knobs.write_requests_per_second, 1)), @@ -150,6 +150,7 @@ public: std::string secret; bool lookupSecret; BlobKnobs knobs; + HTTP::Headers extraHeaders; // Speed and concurrency limits Reference requestRate; diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 4a70660b17..f1b5091634 100755 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -1299,6 +1299,7 @@ namespace fileBackup { state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES)); Void _ = wait(checkTaskVersion(cx, task, name, version)); + state double startTime = timer(); state Reference tr(new ReadYourWritesTransaction(cx)); // The shard map will use 3 values classes. Exactly SKIP, exactly DONE, then any number >= NOT_DONE_MIN which will mean not done. 
@@ -1689,7 +1690,8 @@ namespace fileBackup { .detail("SnapshotBeginVersion", snapshotBeginVersion) .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) .detail("CurrentVersion", recentReadVersion) - .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); + .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds) + .detail("DispatchTimeSeconds", timer() - startTime); Params.snapshotFinished().set(task, true); } diff --git a/fdbclient/HTTP.actor.cpp b/fdbclient/HTTP.actor.cpp index b8e497acf0..735bc8874c 100644 --- a/fdbclient/HTTP.actor.cpp +++ b/fdbclient/HTTP.actor.cpp @@ -31,7 +31,7 @@ namespace HTTP { o.reserve(s.size() * 3); char buf[4]; for(auto c : s) - if(std::isalnum(c) || c == '?' || c == '/' || c == '-' || c == '_' || c == '.') + if(std::isalnum(c) || c == '?' || c == '/' || c == '-' || c == '_' || c == '.' || c == ',' || c == ':') o.append(&c, 1); else { sprintf(buf, "%%%.02X", c); diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index d68e445d2c..61e31bd26b 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -104,7 +104,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( BACKUP_LOCK_BYTES, 1e8 ); init( BACKUP_RANGE_TIMEOUT, TASKBUCKET_TIMEOUT_VERSIONS/CORE_VERSIONSPERSECOND/2.0 ); init( BACKUP_RANGE_MINWAIT, std::max(1.0, BACKUP_RANGE_TIMEOUT/2.0)); - init( BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC, 60 * 60 ); // 1 hour + init( BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC, 10 * 60 ); // 10 minutes init( BACKUP_DEFAULT_SNAPSHOT_INTERVAL_SEC, 3600 * 24 * 10); // 10 days init( BACKUP_SHARD_TASK_LIMIT, 1000 ); if( randomize && BUGGIFY ) BACKUP_SHARD_TASK_LIMIT = 4; init( BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL, 60); diff --git a/flow/Arena.h b/flow/Arena.h index b776b72289..b359f8c10b 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -499,6 +499,22 @@ public: StringRef eat(const char *sep) { return eat(StringRef((const uint8_t *)sep, strlen(sep))); } + // Return StringRef of bytes from begin() up to but not including the first byte matching any byte in sep, + // and remove that sequence (including the sep byte) from *this + // Returns and removes all bytes from *this if no bytes within sep were found + StringRef eatAny(StringRef sep) { + auto iSep = std::find_first_of(begin(), end(), sep.begin(), sep.end()); + if(iSep != end()) { + const int i = iSep - begin(); + StringRef token = substr(0, i); + *this = substr(i + 1); + return token; + } + return eat(); + } + StringRef eatAny(const char *sep) { + return eatAny(StringRef((const uint8_t *)sep, strlen(sep))); + } private: // Unimplemented; blocks conversion through std::string From f6559bbb11bd824340f9d09290282c85c759ce57 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Mar 2019 05:13:48 -0800 Subject: [PATCH 15/56] Added help and documentation for fdbbackup modify. Add BackupUID and more parseable BackupURL to fdbbackup status output. --- documentation/sphinx/source/backups.rst | 25 ++++++++++++++++++++++++- fdbbackup/backup.actor.cpp | 8 ++++++-- fdbclient/FileBackupAgent.actor.cpp | 2 ++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/documentation/sphinx/source/backups.rst b/documentation/sphinx/source/backups.rst index 339ee72029..1679924290 100644 --- a/documentation/sphinx/source/backups.rst +++ b/documentation/sphinx/source/backups.rst @@ -39,7 +39,7 @@ Tools There are 5 command line tools for working with Backup and DR operations: ``fdbbackup`` - This command line tool is used to control (but not execute) backup jobs and manage backup data. 
It can ``start`` or ``abort`` a backup, ``discontinue`` a continuous backup, get the ``status`` of an ongoing backup, or ``wait`` for a backup to complete. It can also ``describe``, ``delete``, ``expire`` data in a backup, or ``list`` the backups at a destination folder URL. + This command line tool is used to control (but not execute) backup jobs and manage backup data. It can ``start``, ``modify`` or ``abort`` a backup, ``discontinue`` a continuous backup, get the ``status`` of an ongoing backup, or ``wait`` for a backup to complete. It can also ``describe``, ``delete``, ``expire`` data in a backup, or ``list`` the backups at a destination folder URL. ``fdbrestore`` This command line tool is used to control (but not execute) restore jobs. It can ``start`` or ``abort`` a restore, get the ``status`` of current and recent restore tasks, or ``wait`` for a restore task to complete while printing ongoing progress details. @@ -235,6 +235,29 @@ The ``start`` subcommand is used to start a backup. If there is already a backu user@host$ fdbbackup start -k 'apple bananna' -k 'mango pineapple' -d <BACKUP_URL> user@host$ fdbbackup start -k '@pp1e b*n*nn*' -k '#an&0 p^n3app!e' -d <BACKUP_URL> +.. program:: fdbbackup modify + +``modify`` +---------- + +The ``modify`` subcommand is used to modify parameters of a running backup. All specified changes are made in a single transaction. + +:: + + user@host$ fdbbackup modify [-t <TAG>] [-d <BACKUP_URL>] [-s <DURATION>] [-as <DURATION>] [--verify_uid <UID>] + +``-d <BACKUP_URL>`` + Sets a new Backup URL for the backup to write to. This is most likely to be used to change only URL parameters or account information. However, it can also be used to start writing to a new destination mid-backup. The old location will cease gaining any additional restorability, while the new location will not be restorable until a new snapshot begins and completes. Full restorability would be regained, however, if the contents of the two destinations were to be combined by the user. + +``-s <DURATION>`` or ``--snapshot_interval <DURATION>`` + Sets a new duration for backup snapshots, in seconds. + +``-as <DURATION>`` or ``--active_snapshot_interval <DURATION>`` + Sets a new duration for the backup's currently active snapshot, in seconds, relative to the start of the snapshot. + +``--verify_uid <UID>`` + Specifies a UID to verify against the BackupUID of the running backup. If provided, the UID is verified in the same transaction which sets the new backup parameters. + .. program:: fdbbackup abort ``abort`` diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index f704f44124..b838a3cf08 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -802,7 +802,7 @@ static void printBackupUsage(bool devhelp) { " FDB_CLUSTER_FILE environment variable, then `./fdb.cluster',\n" " then `%s'.\n", platform::getDefaultClusterFilePath().c_str()); printf(" -d, --destcontainer URL\n" - " The Backup container URL for start, describe, expire, and delete operations.\n"); + " The Backup container URL for start, modify, describe, expire, and delete operations.\n"); printBackupContainerInfo(); printf(" -b, --base_url BASEURL\n" " Base backup URL for list operations. This looks like a Backup URL but without a backup name.\n"); @@ -827,7 +827,11 @@ static void printBackupUsage(bool devhelp) { printf(" For describe operations, lookup versions in the database to obtain timestamps. 
A cluster file is required.\n"); printf(" -f, --force For expire operations, force expiration even if minimum restorability would be violated.\n"); printf(" -s, --snapshot_interval DURATION\n" - " For start operations, specifies the backup's target snapshot interval as DURATION seconds. Defaults to %d.\n", CLIENT_KNOBS->BACKUP_DEFAULT_SNAPSHOT_INTERVAL_SEC); + " For start or modify operations, specifies the backup's default target snapshot interval as DURATION seconds. Defaults to %d for start operations.\n", CLIENT_KNOBS->BACKUP_DEFAULT_SNAPSHOT_INTERVAL_SEC); + printf(" -as, --active_snapshot_interval DURATION\n" + " For modify operations, sets the desired interval for the backup's currently active snapshot, relative to the start of the snapshot.\n"); + printf(" --verify_uid UID\n" + " Specifies a UID to verify against the BackupUID of the running backup. If provided, the UID is verified in the same transaction which sets the new backup parameters.\n"); printf(" -e ERRORLIMIT The maximum number of errors printed by status (default is 10).\n"); printf(" -k KEYS List of key ranges to backup.\n" " If not specified, the entire database will be backed up.\n"); diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index f1b5091634..346257164e 100755 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -3803,6 +3803,8 @@ public: statusText += "The previous backup on tag `" + tagName + "' at " + bc->getURL() + " " + backupStatus + ".\n"; break; } + statusText += format("BackupUID: %s\n", uidAndAbortedFlag.get().first.toString().c_str()); + statusText += format("BackupURL: %s\n", bc->getURL().c_str()); if(snapshotProgress) { state int64_t snapshotInterval; From a5b53d3c0f20f71784f73920f36b7b37d35d16d1 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Mar 2019 10:44:14 -0800 Subject: [PATCH 16/56] More documentation and release note changes for fdbbackup modify. Added release note for #1205 bug fix which was missed. --- documentation/sphinx/source/backups.rst | 2 +- documentation/sphinx/source/release-notes.rst | 7 +++++++ fdbbackup/backup.actor.cpp | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/documentation/sphinx/source/backups.rst b/documentation/sphinx/source/backups.rst index 1679924290..bef728d200 100644 --- a/documentation/sphinx/source/backups.rst +++ b/documentation/sphinx/source/backups.rst @@ -256,7 +256,7 @@ The ``modify`` subcommand is used to modify parameters of a running backup. All Sets new duration for the backup's currently active snapshot, in seconds, relative to the start of the snapshot. ``--verify_uid `` - Specifies a UID to verify against the BackupUID of the running backup. If provided, the UID is verified in the same transaction which sets the new backup parameters. + Specifies a UID to verify against the BackupUID of the running backup. If provided, the UID is verified in the same transaction which sets the new backup parameters (if the UID matches). .. program:: fdbbackup abort diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 51345e3151..cd71f2ee11 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -12,6 +12,13 @@ Fixes * The ``include`` command in fdbcli would falsly include all machines with IP addresses that have the included IP address as a prefix (for example ``include 1.0.0.1`` would also include ``1.0.0.10``). 
`(PR #1121) `_ +* Restore could crash when reading a file that ends on a block boundary (1MB default). `(PR #1205) `_ + +Features +-------- + +* Added ``modify`` command to fdbbackup for modifying parameters of a running backup. `(PR #1237) `_ +* Added 'header' parameter to blobstore backup URLs for setting custom HTTP headers. `(PR #1237) `_ 6.0.18 ====== diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index b838a3cf08..f7b26bf981 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -831,7 +831,8 @@ static void printBackupUsage(bool devhelp) { printf(" -as, --active_snapshot_interval DURATION\n" " For modify operations, sets the desired interval for the backup's currently active snapshot, relative to the start of the snapshot.\n"); printf(" --verify_uid UID\n" - " Specifies a UID to verify against the BackupUID of the running backup. If provided, the UID is verified in the same transaction which sets the new backup parameters.\n"); + " Specifies a UID to verify against the BackupUID of the running backup. If provided, the UID is verified in the same transaction\n" + " which sets the new backup parameters (if the UID matches).\n"); printf(" -e ERRORLIMIT The maximum number of errors printed by status (default is 10).\n"); printf(" -k KEYS List of key ranges to backup.\n" " If not specified, the entire database will be backed up.\n"); From 5980276aea09888432eb2e90cb1e0e401dc72bd9 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Mar 2019 11:56:36 -0800 Subject: [PATCH 17/56] An HTTP RequestID/ResponseID mismatch is logged as only a warning if the response code is 5xx and the responseID is empty. --- fdbclient/HTTP.actor.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fdbclient/HTTP.actor.cpp b/fdbclient/HTTP.actor.cpp index b8e497acf0..ed08663f41 100644 --- a/fdbclient/HTTP.actor.cpp +++ b/fdbclient/HTTP.actor.cpp @@ -389,8 +389,15 @@ namespace HTTP { event.detail("RequestIDReceived", responseID); if(requestID != responseID) { err = http_bad_request_id(); + // Log a non-debug a error - TraceEvent(SevError, "HTTPRequestFailedIDMismatch") + Severity sev = SevError; + // If the response code is 5xx (server error) and the responseID is empty then just warn + if(responseID.empty() && r->code >= 500 && r->code < 600) { + sev = SevWarnAlways; + } + + TraceEvent(sev, "HTTPRequestFailedIDMismatch") .detail("DebugID", conn->getDebugID()) .detail("RemoteAddress", conn->getPeerAddress()) .detail("Verb", verb) From a57ddd29f2a01c27d0f1be811ab107a25b16b5ed Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Mar 2019 18:05:52 -0800 Subject: [PATCH 18/56] Bug fixes in fdbbackup modify command. Active snapshot interval was wrong. New backup destination containers were not being created. Renamed some variables for clarity. --- documentation/sphinx/source/backups.rst | 2 +- fdbbackup/backup.actor.cpp | 40 ++++++++++++++++++------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/documentation/sphinx/source/backups.rst b/documentation/sphinx/source/backups.rst index bef728d200..772a6e84f2 100644 --- a/documentation/sphinx/source/backups.rst +++ b/documentation/sphinx/source/backups.rst @@ -197,7 +197,7 @@ The following options apply to most subcommands: Path to the cluster file that should be used to connect to the FoundationDB cluster you want to use. If not specified, a :ref:`default cluster file ` will be used. ``-d `` - The Backup URL which the subcommand should read, write, or modify. 
For ``start`` operations, the Backup URL must be accessible by the ``backup_agent`` processes. + The Backup URL which the subcommand should read, write, or modify. For ``start`` and ``modify`` operations, the Backup URL must be accessible by the ``backup_agent`` processes. ``-t <TAG>`` A "tag" is a named slot in which a backup task executes. Backups on different named tags make progress and are controlled independently, though their executions are handled by the same set of backup agent processes. Any number of unique backup tags can be active at once. If the tag is not specified, the default tag name "default" is used. diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index f7b26bf981..867e2aaba5 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -2082,14 +2082,35 @@ ACTOR Future<Void> listBackup(std::string baseUrl) { struct BackupModifyOptions { Optional<std::string> verifyUID; Optional<std::string> destURL; - Optional<int> snapshotInterval; - Optional<int> activeSnapshotInterval; + Optional<int> snapshotIntervalSeconds; + Optional<int> activeSnapshotIntervalSeconds; + bool hasChanges() const { + return destURL.present() || snapshotIntervalSeconds.present() || activeSnapshotIntervalSeconds.present(); + } }; ACTOR Future<Void> modifyBackup(Database db, std::string tagName, BackupModifyOptions options) { + if(!options.hasChanges()) { + fprintf(stderr, "No changes were specified, nothing to do!\n"); + throw backup_error(); + } + state KeyBackedTag tag = makeBackupTag(tagName); - state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction()); + state Reference<IBackupContainer> bc; + if(options.destURL.present()) { + bc = openBackupContainer(exeBackup.toString().c_str(), options.destURL.get()); + try { + Void _ = wait(timeoutError(bc->create(), 30)); + } catch(Error &e) { + if(e.code() == error_code_actor_cancelled) + throw; + fprintf(stderr, "ERROR: Could not create backup container at '%s': %s\n", options.destURL.get().c_str(), e.what()); + throw backup_error(); + } + } + + state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(db)); loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); @@ -2113,17 +2134,16 @@ ACTOR Future<Void> modifyBackup(Database db, std::string tagName, BackupModifyOp throw backup_error(); } - if(options.snapshotInterval.present()) { - config.snapshotIntervalSeconds().set(tr, options.snapshotInterval.get()); + if(options.snapshotIntervalSeconds.present()) { + config.snapshotIntervalSeconds().set(tr, options.snapshotIntervalSeconds.get()); } - if(options.activeSnapshotInterval.present()) { + if(options.activeSnapshotIntervalSeconds.present()) { Version begin = wait(config.snapshotBeginVersion().getOrThrow(tr, false, backup_error())); - config.snapshotTargetEndVersion().set(tr, begin); + config.snapshotTargetEndVersion().set(tr, begin + ((int64_t)options.activeSnapshotIntervalSeconds.get() * CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } if(options.destURL.present()) { - Reference<IBackupContainer> bc = openBackupContainer(exeBackup.toString().c_str(), options.destURL.get()); config.backupContainer().set(tr, bc); } @@ -2720,10 +2740,10 @@ int main(int argc, char* argv[]) { } if(optId == OPT_SNAPSHOTINTERVAL) { snapshotIntervalSeconds = seconds; - modifyOptions.snapshotInterval = seconds; + modifyOptions.snapshotIntervalSeconds = seconds; } else if(optId == OPT_MOD_ACTIVE_INTERVAL) { - modifyOptions.activeSnapshotInterval = seconds; + modifyOptions.activeSnapshotIntervalSeconds = seconds; } break; } From 87d0c5bae0801dae717b8a320fa5a928fd11d0bc Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Mar 2019 21:14:21 -0800 
Subject: [PATCH 19/56] Bug/usability fix: The output URLs of fdbbackup list were meant to be directly usable for backup management operations but they were missing the bucket URL parameter. --- fdbclient/BackupContainer.actor.cpp | 2 +- fdbclient/BlobStore.actor.cpp | 37 ++++++++++++++++++----------- fdbclient/BlobStore.h | 4 ++-- flow/Arena.h | 9 ++++--- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 95af3f422f..2dab4c7046 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -1360,7 +1360,7 @@ public: BlobStoreEndpoint::ListResult contents = wait(bstore->listBucket(bucket, basePath)); std::vector results; for(auto &f : contents.objects) { - results.push_back(bstore->getResourceURL(f.name.substr(basePath.size()))); + results.push_back(bstore->getResourceURL(f.name.substr(basePath.size()), format("bucket=%s", bucket.c_str()))); } return results; } diff --git a/fdbclient/BlobStore.actor.cpp b/fdbclient/BlobStore.actor.cpp index 1bcaad3737..8a0e7e672e 100644 --- a/fdbclient/BlobStore.actor.cpp +++ b/fdbclient/BlobStore.actor.cpp @@ -147,10 +147,11 @@ Reference BlobStoreEndpoint::fromString(std::string const &ur StringRef prefix = t.eat("://"); if(prefix != LiteralStringRef("blobstore")) throw format("Invalid blobstore URL prefix '%s'", prefix.toString().c_str()); - StringRef cred = t.eat("@"); - StringRef hostPort = t.eatAny("?/"); + StringRef cred = t.eat("@"); + uint8_t foundSeparator = 0; + StringRef hostPort = t.eatAny("/?", &foundSeparator); StringRef resource; - if(t[-1] == '/') { + if(foundSeparator == '/') { resource = t.eat("?"); } @@ -228,7 +229,7 @@ Reference BlobStoreEndpoint::fromString(std::string const &ur } } -std::string BlobStoreEndpoint::getResourceURL(std::string resource) { +std::string BlobStoreEndpoint::getResourceURL(std::string resource, std::string params) { std::string hostPort = host; if(!service.empty()) { hostPort.append(":"); @@ -241,20 +242,28 @@ std::string BlobStoreEndpoint::getResourceURL(std::string resource) { s = std::string(":") + secret; std::string r = format("blobstore://%s%s@%s/%s", key.c_str(), s.c_str(), hostPort.c_str(), resource.c_str()); - std::string p = knobs.getURLParameters(); - for(auto &kv : extraHeaders) { - if(!p.empty()) { - p.append("&"); + // Get params that are deviations from knob defaults + std::string knobParams = knobs.getURLParameters(); + if(!knobParams.empty()) { + if(!params.empty()) { + params.append("&"); } - p.append("header="); - p.append(HTTP::urlEncode(kv.first)); - p.append(":"); - p.append(HTTP::urlEncode(kv.second)); + params.append(knobParams); } - if(!p.empty()) - r.append("?").append(p); + for(auto &kv : extraHeaders) { + if(!params.empty()) { + params.append("&"); + } + params.append("header="); + params.append(HTTP::urlEncode(kv.first)); + params.append(":"); + params.append(HTTP::urlEncode(kv.second)); + } + + if(!params.empty()) + r.append("?").append(params); return r; } diff --git a/fdbclient/BlobStore.h b/fdbclient/BlobStore.h index 5b88c8f023..9d05ec39ad 100644 --- a/fdbclient/BlobStore.h +++ b/fdbclient/BlobStore.h @@ -133,8 +133,8 @@ public: // the unconsumed parameters will be added to it. 
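// A sketch of a typical call, using the documented example URL (credentials hypothetical): // fromString("blobstore://myKey:mySecret@something.domain.com:80/dec_1_2017_0400?bucket=backups")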
static Reference fromString(std::string const &url, std::string *resourceFromURL = nullptr, std::string *error = nullptr, ParametersT *ignored_parameters = nullptr); - // Get a normalized version of this URL with the given resource and any non-default BlobKnob values as URL parameters. - std::string getResourceURL(std::string resource); + // Get a normalized version of this URL with the given resource and any non-default BlobKnob values as URL parameters in addition to the passed params string + std::string getResourceURL(std::string resource, std::string params); struct ReusableConnection { Reference conn; diff --git a/flow/Arena.h b/flow/Arena.h index b359f8c10b..4adc093a3c 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -502,9 +502,12 @@ public: // Return StringRef of bytes from begin() up to but not including the first byte matching any byte in sep, // and remove that sequence (including the sep byte) from *this // Returns and removes all bytes from *this if no bytes within sep were found - StringRef eatAny(StringRef sep) { + StringRef eatAny(StringRef sep, uint8_t *foundSeparator) { auto iSep = std::find_first_of(begin(), end(), sep.begin(), sep.end()); if(iSep != end()) { + if(foundSeparator != nullptr) { + *foundSeparator = *iSep; + } const int i = iSep - begin(); StringRef token = substr(0, i); *this = substr(i + 1); @@ -512,8 +515,8 @@ public: } return eat(); } - StringRef eatAny(const char *sep) { - return eatAny(StringRef((const uint8_t *)sep, strlen(sep))); + StringRef eatAny(const char *sep, uint8_t *foundSeparator) { + return eatAny(StringRef((const uint8_t *)sep, strlen(sep)), foundSeparator); } private: From 7f37e25c759c445c4163157b14cfd14e18fceeb9 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Mar 2019 21:17:56 -0800 Subject: [PATCH 20/56] Fdbbackup binary will ignore .debug suffix when determining how it was invoked. --- fdbbackup/backup.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 867e2aaba5..87948ba583 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -1010,6 +1010,9 @@ enumProgramExe getProgramType(std::string programExe) } } #endif + // For debugging convenience, remove .debug suffix if present. + if(StringRef(programExe).endsWith(LiteralStringRef(".debug"))) + programExe = programExe.substr(0, programExe.size() - 6); // Check if backup agent if ((programExe.length() >= exeAgent.size()) && From eed8dc816db6b5b18ab42da8cd5a51ba8f34c71a Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Mar 2019 22:17:37 -0800 Subject: [PATCH 21/56] Bug fix: fdbbackup modify on a tag that has never been used before would result in an internal error. 
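Illustrative repro (tag name hypothetical); with this fix the tool prints a clean error instead of failing internally:

    user@host$ fdbbackup modify -t never_used_tag -s 3600
    No backup exists on tag 'never_used_tag'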
--- fdbbackup/backup.actor.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 87948ba583..94527b2a7a 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -2120,6 +2120,12 @@ ACTOR Future modifyBackup(Database db, std::string tagName, BackupModifyOp tr->setOption(FDBTransactionOptions::LOCK_AWARE); state Optional uidFlag = wait(tag.get(db)); + + if(!uidFlag.present()) { + fprintf(stderr, "No backup exists on tag '%s'\n", tagName.c_str()); + throw backup_error(); + } + if(uidFlag.get().second) { fprintf(stderr, "Cannot modify aborted backup on tag '%s'\n", tagName.c_str()); throw backup_error(); From 2fbc7522e4c75168b156eccdc4f5e75bc99ed237 Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Wed, 6 Mar 2019 18:25:02 -0500 Subject: [PATCH 22/56] review feedback: fix bindingtester test, add comments to versionstamp and other structures, handle nested tuples, handle prefix []byte in PackWithVersionstamp --- bindings/go/src/_stacktester/stacktester.go | 17 +++-- bindings/go/src/fdb/tuple/tuple.go | 84 ++++++++++++++++----- 2 files changed, 74 insertions(+), 27 deletions(-) diff --git a/bindings/go/src/_stacktester/stacktester.go b/bindings/go/src/_stacktester/stacktester.go index b2012546e7..0c925af503 100644 --- a/bindings/go/src/_stacktester/stacktester.go +++ b/bindings/go/src/_stacktester/stacktester.go @@ -105,7 +105,7 @@ func (sm *StackMachine) waitAndPop() (ret stackEntry) { switch el := ret.item.(type) { case []byte: ret.item = el - case int64, uint64, *big.Int, string, bool, tuple.UUID, float32, float64, tuple.Tuple: + case int64, uint64, *big.Int, string, bool, tuple.UUID, float32, float64, tuple.Tuple, tuple.Versionstamp: ret.item = el case fdb.Key: ret.item = []byte(el) @@ -662,20 +662,21 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) { t = append(t, sm.waitAndPop().item) } sm.store(idx, []byte(t.Pack())) - case op == "TUPLE_PACK_VERSIONSTAMP": + case op == "TUPLE_PACK_WITH_VERSIONSTAMP": var t tuple.Tuple + + prefix := sm.waitAndPop().item.([]byte) + c := sm.waitAndPop().item.(int64) + for i := 0; i < int(c); i++ { t = append(t, sm.waitAndPop().item) } - incomplete, err := t.HasIncompleteVersionstamp() - if incomplete == false { + packed, err := t.PackWithVersionstamp(prefix) + if err != nil && strings.Contains(err.Error(), "No incomplete") { sm.store(idx, []byte("ERROR: NONE")) } else if err != nil { sm.store(idx, []byte("ERROR: MULTIPLE")) } else { - packed := t.Pack() sm.store(idx, []byte("OK")) sm.store(idx, packed) } @@ -911,7 +912,7 @@ func main() { log.Fatal("API version not equal to value selected") } - db, e = fdb.OpenDatabase(clusterFile) + db, e = fdb.Open(clusterFile, []byte("DB")) if e != nil { log.Fatal(e) } diff --git a/bindings/go/src/fdb/tuple/tuple.go b/bindings/go/src/fdb/tuple/tuple.go index 2c30705ba0..d3f289e2eb 100644 --- a/bindings/go/src/fdb/tuple/tuple.go +++ b/bindings/go/src/fdb/tuple/tuple.go @@ -73,7 +73,10 @@ type Tuple []TupleElement // an instance of this type. type UUID [16]byte -// Versionstamp . +// Versionstamp is a struct for a FoundationDB versionstamp. Versionstamps are +// 12 bytes long composed of a 10 byte transaction version and a 2 byte user +// version. The transaction version is filled in at commit time and the user +// version is provided by your layer during a transaction.
type Versionstamp struct { TransactionVersion [10]byte UserVersion uint16 } @@ -83,7 +86,8 @@ var incompleteTransactionVersion = [10]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, const versionstampLength = 12 -// IncompleteVersionstamp . +// IncompleteVersionstamp is the constructor you should use to make +// an incomplete versionstamp to use in a tuple. func IncompleteVersionstamp(userVersion uint16) Versionstamp { return Versionstamp{ TransactionVersion: incompleteTransactionVersion, @@ -91,16 +95,14 @@ func IncompleteVersionstamp(userVersion uint16) Versionstamp { } } -// Bytes . +// Bytes converts a Versionstamp struct to a byte slice for encoding in a tuple. func (v Versionstamp) Bytes() []byte { - var scratch [12]byte + var scratch [versionstampLength]byte copy(scratch[:], v.TransactionVersion[:]) binary.BigEndian.PutUint16(scratch[10:], v.UserVersion) - fmt.Println(scratch) - return scratch[:] } @@ -293,8 +295,8 @@ func (p *packer) encodeUUID(u UUID) { func (p *packer) encodeVersionstamp(v Versionstamp) { p.putByte(versionstampCode) - if p.versionstampPos != 0 && v.TransactionVersion == incompleteTransactionVersion { - panic(fmt.Sprintf("Tuple can only contain one unbound versionstamp")) + if p.versionstampPos != -1 && v.TransactionVersion == incompleteTransactionVersion { + panic(fmt.Sprintf("Tuple can only contain one incomplete versionstamp")) } else { p.versionstampPos = int32(len(p.buf)) } @@ -368,49 +370,93 @@ // Tuple satisfies the fdb.KeyConvertible interface, so it is not necessary to // call Pack when using a Tuple with a FoundationDB API function that requires a // key. +// +// This method will panic if the tuple contains an incomplete Versionstamp. Use +// PackWithVersionstamp instead. +// func (t Tuple) Pack() []byte { p := newPacker() p.encodeTuple(t, false) return p.buf } -// PackWithVersionstamp packs the specified tuple into a key for versionstamp operations -func (t Tuple) PackWithVersionstamp() ([]byte, error) { +// PackWithVersionstamp packs the specified tuple into a key for versionstamp +// operations. See Pack for more information. This function will return an error +// if you attempt to pack a tuple with more than one incomplete versionstamp. This function will +// return an error if you attempt to pack a tuple with a versionstamp position larger +// than a uint16 on apiVersion < 520.
+func (t Tuple) PackWithVersionstamp(prefix []byte) ([]byte, error) { hasVersionstamp, err := t.HasIncompleteVersionstamp() if err != nil { return nil, err } + if hasVersionstamp == false { + return nil, errors.New("No incomplete versionstamp included in tuple pack with versionstamp") + } + p := newPacker() + + prefixLength := int32(0) + if prefix != nil { + prefixLength = int32(len(prefix)) + p.putBytes(prefix) + } + p.encodeTuple(t, false) if hasVersionstamp { var scratch [4]byte - binary.LittleEndian.PutUint32(scratch[:], uint32(p.versionstampPos)) - p.putBytes(scratch[:]) + var offsetIndex int + + apiVersion := fdb.MustGetAPIVersion() + if apiVersion < 520 { + if p.versionstampPos > math.MaxUint16 { + return nil, errors.New("Versionstamp position too large") + } + + offsetIndex = 1 + binary.LittleEndian.PutUint16(scratch[:], uint16(prefixLength+p.versionstampPos)) + } else { + offsetIndex = 3 + binary.LittleEndian.PutUint32(scratch[:], uint32(prefixLength+p.versionstampPos)) + } + + p.putBytes(scratch[0:offsetIndex]) } return p.buf, nil } -// HasIncompleteVersionstamp determines if there is at least one incomplete versionstamp in a tuple +// HasIncompleteVersionstamp determines if there is at least one incomplete +// versionstamp in a tuple. This function will return an error if this tuple has +// more than one incomplete versionstamp. func (t Tuple) HasIncompleteVersionstamp() (bool, error) { + incompleteCount := t.countIncompleteVersionstamps() + + var err error + if incompleteCount > 1 { + err = errors.New("Tuple can only contain one incomplete versionstamp") + } + + return incompleteCount == 1, err +} + +func (t Tuple) countIncompleteVersionstamps() int { incompleteCount := 0 + for _, el := range t { switch e := el.(type) { case Versionstamp: if e.TransactionVersion == incompleteTransactionVersion { incompleteCount++ } + case Tuple: + incompleteCount += e.countIncompleteVersionstamps() } } - var err error - if incompleteCount > 1 { - err = errors.New("Tuple can only contain one unbound versionstamp") - } - - return incompleteCount == 1, err + return incompleteCount } func findTerminator(b []byte) int { From 77f7c0721f2ad93f5d4f03a7da49220987fc3c84 Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Wed, 6 Mar 2019 18:27:43 -0500 Subject: [PATCH 23/56] revert this again because my environment is dumb --- bindings/go/src/_stacktester/stacktester.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/go/src/_stacktester/stacktester.go b/bindings/go/src/_stacktester/stacktester.go index 0c925af503..d2dc1bcbe3 100644 --- a/bindings/go/src/_stacktester/stacktester.go +++ b/bindings/go/src/_stacktester/stacktester.go @@ -912,7 +912,7 @@ func main() { log.Fatal("API version not equal to value selected") } - db, e = fdb.Open(clusterFile, []byte("DB")) + db, e = fdb.OpenDatabase(clusterFile) if e != nil { log.Fatal(e) } From eed8dc816db6b5b18ab42da8cd5a51ba8f34c71a Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Wed, 6 Mar 2019 14:21:30 -0800 Subject: [PATCH 24/56] Allows cluster string to contain coordinators with different TLS states During live TLS upgrades, we can switch one coordinator at a time to TLS rather than all of them together.
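With the per-address checks removed below, a connection string may now mix TLS and non-TLS coordinators. An illustrative example, assuming the usual cluster-file syntax in which a trailing :tls marks a TLS address:

    test:abcd1234@10.0.0.1:4500:tls,10.0.0.2:4500,10.0.0.3:4500

Previously fdbcli, ClusterConnectionString, and fdbserver each rejected such a string, which made it impossible to migrate a cluster's coordinators to TLS one at a time.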
--- fdbcli/fdbcli.actor.cpp | 4 ---- fdbclient/MonitorLeader.actor.cpp | 6 ------ fdbserver/fdbserver.actor.cpp | 7 ------- 3 files changed, 17 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 3cb37d6e06..9f7a0c851c 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -1817,10 +1817,6 @@ ACTOR Future coordinators( Database db, std::vector tokens, boo try { // SOMEDAY: Check for keywords auto const& addr = NetworkAddress::parse( t->toString() ); - if (addresses.size() > 0 && addr.isTLS() != addresses.begin()->isTLS()) { - printf("ERROR: cannot use coordinators with different TLS states: `%s'\n", t->toString().c_str()); - return true; - } if (addresses.count(addr)){ printf("ERROR: passed redundant coordinators: `%s'\n", addr.toString().c_str()); return true; diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index c07ed5ef73..9aea781e5e 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -176,12 +176,6 @@ ClusterConnectionString::ClusterConnectionString( std::string const& connectionS coord = NetworkAddress::parseList(addrs); ASSERT( coord.size() > 0 ); // parseList() always returns at least one address if it doesn't throw - bool isTLS = coord[0].isTLS(); - for( auto const& server : coord ) { - if( server.isTLS() != isTLS ) - throw connection_string_invalid(); - } - std::sort( coord.begin(), coord.end() ); // Check that there are no duplicate addresses if ( std::unique( coord.begin(), coord.end() ) != coord.end() ) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index f208eb92d8..c1dbb09e26 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -759,13 +759,6 @@ std::pair buildNetworkAddresses(const Cl const NetworkAddressList& coordinators = connectionFile.getConnectionString().coordinators(); ASSERT(coordinators.size() > 0); - bool clusterIsTLS = coordinators[0].isTLS(); - for (int ii = 1; ii < coordinators.size(); ++ii) { - if (coordinators[ii].isTLS() != clusterIsTLS) { - fprintf(stderr, "ERROR: coordinators cannot have mixed TLS state.\n"); - flushAndExit(FDB_EXIT_ERROR); - } - } int numTLSAddress = 0; From 23f86132f5c6876aca024617de89772e4a9f6309 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Wed, 6 Mar 2019 18:34:36 -0800 Subject: [PATCH 25/56] Resolves #1235: Java: FDBExceptions are created on successful operation completion (#1236) The native function `NativeFuture::Future_getError` now returns `null` when the error code is 0 instead of an `FDBException` with a "Success" message and an error code of 0. This was only used in two places within the codebase; those two places now check for `null` errors and treat them like successes.
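A condensed sketch of the new convention (stand-in types, not the actual JNI signatures): a zero error code maps to a null reference, so the success path allocates nothing for the garbage collector to reclaim.

    #include <cstdint>
    #include <cstdio>

    struct Throwable { int32_t code; };  // stand-in for the Java-side FDBException
    using fdb_error_t = int32_t;

    // Return an owned error object only for real failures; success returns null.
    Throwable* getErrorOrNull(fdb_error_t err) {
        return err ? new Throwable{ err } : nullptr;
    }

    int main() {
        Throwable* t = getErrorOrNull(0);
        if (t == nullptr)
            std::puts("success: no exception object created");
        delete t;  // deleting a null pointer is a no-op
    }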
--- bindings/java/fdbJNI.cpp | 6 +++++- .../java/src/main/com/apple/foundationdb/FutureResults.java | 2 +- .../java/src/main/com/apple/foundationdb/FutureVoid.java | 2 +- documentation/sphinx/source/release-notes.rst | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index 0e41b8d2bc..f2de8fa93f 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -206,7 +206,11 @@ JNIEXPORT jthrowable JNICALL Java_com_apple_foundationdb_NativeFuture_Future_1ge return JNI_NULL; } FDBFuture *sav = (FDBFuture *)future; - return getThrowable( jenv, fdb_future_get_error( sav ) ); + fdb_error_t err = fdb_future_get_error( sav ); + if( err ) + return getThrowable( jenv, err ); + else + return JNI_NULL; } JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_NativeFuture_Future_1isReady(JNIEnv *jenv, jobject, jlong future) { diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureResults.java b/bindings/java/src/main/com/apple/foundationdb/FutureResults.java index a710fca8a4..4900d45dc2 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FutureResults.java +++ b/bindings/java/src/main/com/apple/foundationdb/FutureResults.java @@ -37,7 +37,7 @@ class FutureResults extends NativeFuture { protected RangeResultInfo getIfDone_internal(long cPtr) throws FDBException { FDBException err = Future_getError(cPtr); - if(!err.isSuccess()) { + if(err != null && !err.isSuccess()) { throw err; } diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureVoid.java b/bindings/java/src/main/com/apple/foundationdb/FutureVoid.java index c5f7b6eb36..86ba565ea9 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FutureVoid.java +++ b/bindings/java/src/main/com/apple/foundationdb/FutureVoid.java @@ -34,7 +34,7 @@ class FutureVoid extends NativeFuture { // with a get on the error and throw if the error is not success. FDBException err = Future_getError(cPtr); - if(!err.isSuccess()) { + if(err != null && !err.isSuccess()) { throw err; } return null; diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 51345e3151..695d2792c7 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -12,6 +12,7 @@ Fixes * The ``include`` command in fdbcli would falsely include all machines with IP addresses that have the included IP address as a prefix (for example ``include 1.0.0.1`` would also include ``1.0.0.10``). `(PR #1121) `_ +* Java: Successful commits and range reads no longer create ``FDBException`` objects to reduce memory pressure. `(Issue #1235) `_ 6.0.18 ====== From 3c86643822f23164198828456218a7d1fdc8da8d Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 14 Feb 2019 16:24:46 -0800 Subject: [PATCH 26/56] Separate Ratekeeper from data distribution. Add a new role for ratekeeper. Remove StorageServerChanges from data distribution. Ratekeeper monitors storage servers, borrowing the idea from DataDistribution.
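The cluster controller supervises the new role the same way it already supervises the data distributor: recruit a ratekeeper if none is registered, otherwise watch the existing one and clear it on failure. A compilable sketch of that loop, with stub types and synchronous stand-ins for the waitRKRejoinOrStartRK actor in the diff below:

    #include <optional>

    struct RatekeeperInterface {};                             // stand-in for the real interface
    void waitForFailure(const RatekeeperInterface&) {}         // stub for waitFailureClient()
    RatekeeperInterface recruitBestFitWorker() { return {}; }  // stub for startRatekeeper()

    // Runs for the lifetime of the cluster controller, like the real actor.
    void ratekeeperSupervisor(std::optional<RatekeeperInterface>& current) {
        for (;;) {
            if (current) {
                waitForFailure(*current);  // bounded by RATEKEEPER_FAILURE_TIME in the patch
                current.reset();           // clearInterf(ProcessClass::RateKeeperClass)
            } else {
                current = recruitBestFitWorker();  // startRatekeeper() + setRatekeeper()
            }
        }
    }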
--- fdbrpc/Locality.cpp | 23 +++++ fdbrpc/Locality.h | 7 +- fdbrpc/simulator.h | 1 + fdbserver/CMakeLists.txt | 1 + fdbserver/ClusterController.actor.cpp | 92 ++++++++++++++++++-- fdbserver/ClusterRecruitmentInterface.h | 7 +- fdbserver/DataDistribution.actor.cpp | 77 +++++------------ fdbserver/DataDistribution.actor.h | 1 + fdbserver/DataDistributorInterface.h | 36 +------- fdbserver/Knobs.cpp | 2 + fdbserver/Knobs.h | 2 + fdbserver/MasterProxyServer.actor.cpp | 17 ++-- fdbserver/Ratekeeper.actor.cpp | 110 ++++++++++++++++++------ fdbserver/Ratekeeper.h | 35 -------- fdbserver/RatekeeperInterface.h | 93 ++++++++++++++++++++ fdbserver/ServerDBInfo.h | 2 + fdbserver/WorkerInterface.actor.h | 16 +++- fdbserver/fdbserver.vcxproj | 2 +- fdbserver/fdbserver.vcxproj.filters | 3 +- fdbserver/masterserver.actor.cpp | 1 - fdbserver/worker.actor.cpp | 26 +++++- flow/network.h | 1 + 22 files changed, 377 insertions(+), 178 deletions(-) delete mode 100644 fdbserver/Ratekeeper.h create mode 100644 fdbserver/RatekeeperInterface.h diff --git a/fdbrpc/Locality.cpp b/fdbrpc/Locality.cpp index d1f1957d2a..ff9135d77e 100644 --- a/fdbrpc/Locality.cpp +++ b/fdbrpc/Locality.cpp @@ -185,6 +185,29 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons default: return ProcessClass::WorstFit; } + case ProcessClass::RateKeeper: + switch( _class ) { + case ProcessClass::RateKeeperClass: + return ProcessClass::BestFit; + case ProcessClass::StatelessClass: + return ProcessClass::GoodFit; + case ProcessClass::MasterClass: + return ProcessClass::OkayFit; + case ProcessClass::ResolutionClass: + return ProcessClass::OkayFit; + case ProcessClass::TransactionClass: + return ProcessClass::OkayFit; + case ProcessClass::ProxyClass: + return ProcessClass::OkayFit; + case ProcessClass::UnsetClass: + return ProcessClass::UnsetFit; + case ProcessClass::CoordinatorClass: + return ProcessClass::NeverAssign; + case ProcessClass::TesterClass: + return ProcessClass::NeverAssign; + default: + return ProcessClass::WorstFit; + } default: return ProcessClass::NeverAssign; } diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index bae2fd69e8..1415ad9eff 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -26,9 +26,9 @@ struct ProcessClass { // This enum is stored in restartInfo.ini for upgrade tests, so be very careful about changing the existing items! 
- enum ClassType { UnsetClass, StorageClass, TransactionClass, ResolutionClass, TesterClass, ProxyClass, MasterClass, StatelessClass, LogClass, ClusterControllerClass, LogRouterClass, DataDistributorClass, CoordinatorClass, InvalidClass = -1 }; + enum ClassType { UnsetClass, StorageClass, TransactionClass, ResolutionClass, TesterClass, ProxyClass, MasterClass, StatelessClass, LogClass, ClusterControllerClass, LogRouterClass, DataDistributorClass, CoordinatorClass, RateKeeperClass, InvalidClass = -1 }; enum Fitness { BestFit, GoodFit, UnsetFit, OkayFit, WorstFit, ExcludeFit, NeverAssign }; //cannot be larger than 7 because of leader election mask - enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, NoRole }; + enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, RateKeeper, NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; int16_t _class; int16_t _source; @@ -50,6 +50,7 @@ public: else if (s=="cluster_controller") _class = ClusterControllerClass; else if (s=="data_distributor") _class = DataDistributorClass; else if (s=="coordinator") _class = CoordinatorClass; + else if (s=="ratekeeper") _class = RateKeeperClass; else _class = InvalidClass; } @@ -67,6 +68,7 @@ public: else if (classStr=="cluster_controller") _class = ClusterControllerClass; else if (classStr=="data_distributor") _class = DataDistributorClass; else if (classStr=="coordinator") _class = CoordinatorClass; + else if (classStr=="ratekeeper") _class = RateKeeperClass; else _class = InvalidClass; if (sourceStr=="command_line") _source = CommandLineSource; @@ -99,6 +101,7 @@ public: case ClusterControllerClass: return "cluster_controller"; case DataDistributorClass: return "data_distributor"; case CoordinatorClass: return "coordinator"; + case RateKeeperClass: return "ratekeeper"; default: return "invalid"; } } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 2bfd34e98f..2987c80655 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -99,6 +99,7 @@ public: case ProcessClass::LogRouterClass: return false; case ProcessClass::ClusterControllerClass: return false; case ProcessClass::DataDistributorClass: return false; + case ProcessClass::RateKeeperClass: return false; default: return false; } } diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 9e630d4c7c..58853a2fee 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -57,6 +57,7 @@ set(FDBSERVER_SRCS QuietDatabase.h Ratekeeper.actor.cpp Ratekeeper.h + RatekeeperInterface.h RecoveryState.h Restore.actor.cpp RestoreInterface.h diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index aadebd93c2..d95b9c1385 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -30,6 +30,7 @@ #include "fdbserver/LogSystemConfig.h" #include "fdbserver/WaitFailure.h" #include "fdbserver/ClusterRecruitmentInterface.h" +#include "fdbserver/RatekeeperInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/Status.h" #include "fdbserver/LatencyBandConfig.h" @@ -110,17 +111,28 @@ public: { } - void setDistributor(const DataDistributorInterface& distributorInterf) { + void setDistributor(const DataDistributorInterface& interf) { ServerDBInfo newInfo = serverInfo->get(); newInfo.id = g_random->randomUniqueID(); - newInfo.distributor = distributorInterf; + newInfo.distributor = interf; serverInfo->set( newInfo ); } - 
void clearDistributor() { + void setRatekeeper(const RatekeeperInterface& interf) { ServerDBInfo newInfo = serverInfo->get(); newInfo.id = g_random->randomUniqueID(); - newInfo.distributor = Optional(); + newInfo.ratekeeper = interf; + serverInfo->set( newInfo ); + } + + void clearInterf(ProcessClass::ClassType t) { + ServerDBInfo newInfo = serverInfo->get(); + newInfo.id = g_random->randomUniqueID(); + if (t == ProcessClass::DataDistributorClass) { + newInfo.distributor = Optional(); + } else if (t == ProcessClass::RateKeeperClass) { + newInfo.ratekeeper = Optional(); + } serverInfo->set( newInfo ); } }; @@ -524,6 +536,9 @@ public: if (db.serverInfo->get().distributor.present()) { (*id_used)[db.serverInfo->get().distributor.get().locality.processId()]++; } + if (db.serverInfo->get().ratekeeper.present()) { + (*id_used)[db.serverInfo->get().ratekeeper.get().locality.processId()]++; + } } RecruitRemoteFromConfigurationReply findRemoteWorkersForConfiguration( RecruitRemoteFromConfigurationRequest const& req ) { @@ -1752,7 +1767,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { if ( req.distributorInterf.present() && !self->db.serverInfo->get().distributor.present() ) { const DataDistributorInterface& di = req.distributorInterf.get(); TraceEvent("ClusterController_RegisterDataDistributor", self->id).detail("DDID", di.id()); - self->db.setDistributor( di ); + self->db.setDistributor(di); } if( info == self->id_worker.end() ) { self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo ); @@ -2341,7 +2356,7 @@ ACTOR Future startDataDistributor( ClusterControllerDa } } -ACTOR Future waitDDRejoinOrStartDD( ClusterControllerData *self, ClusterControllerFullInterface *clusterInterface ) { +ACTOR Future waitDDRejoinOrStartDD(ClusterControllerData *self) { state Future initialDelay = delay(SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY); // wait for a while to see if existing data distributor will join. 
@@ -2361,10 +2376,68 @@ ACTOR Future waitDDRejoinOrStartDD( ClusterControllerData *self, ClusterCo wait( waitFailureClient( self->db.serverInfo->get().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) ); TraceEvent("ClusterController", self->id) .detail("DataDistributorDied", self->db.serverInfo->get().distributor.get().id()); - self->db.clearDistributor(); + self->db.clearInterf(ProcessClass::DataDistributorClass); } else { DataDistributorInterface distributorInterf = wait( startDataDistributor(self) ); - self->db.setDistributor( distributorInterf ); + self->db.setDistributor(distributorInterf); + } + } +} + +ACTOR Future startRatekeeper(ClusterControllerData *self) { + loop { + try { + while ( self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ) { + wait( self->db.serverInfo->onChange() ); + } + + std::map>, int> id_used = self->getUsedIds(); + Optional dcId = self->clusterControllerDcId; + state WorkerFitnessInfo rkWorker = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::RateKeeper, ProcessClass::NeverAssign, self->db.config, id_used); + state InitializeRatekeeperRequest req; + req.reqId = g_random->randomUniqueID(); + TraceEvent("ClusterController_RecruitRatekeeper", req.reqId).detail("Addr", rkWorker.worker.first.address()); + + ErrorOr interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); + if (interf.present()) { + TraceEvent("ClusterController_RatekeeperRecruited", req.reqId).detail("Addr", rkWorker.worker.first.address()); + return interf.get(); + } + } + catch (Error& e) { + TraceEvent("ClusterController_RatekeeperRecruitError", req.reqId).error(e); + if ( e.code() != error_code_no_more_servers ) { + throw; + } + } + wait( delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY) ); + } +} + +ACTOR Future waitRKRejoinOrStartRK(ClusterControllerData *self) { + state Future initialDelay = delay(SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY); + + // wait for a while to see if an existing ratekeeper will join. 
+ loop choose { + when ( wait(initialDelay) ) { break; } + when ( wait(self->db.serverInfo->onChange()) ) { // Rejoins via worker registration + if ( self->db.serverInfo->get().ratekeeper.present() ) { + TraceEvent("ClusterController_GotRateKeeper", self->id) + .detail("RKID", self->db.serverInfo->get().ratekeeper.get().id()); + break; + } + } + } + + loop { + if ( self->db.serverInfo->get().ratekeeper.present() ) { + wait( waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ) ); + TraceEvent("ClusterController", self->id) + .detail("RatekeeperDied", self->db.serverInfo->get().ratekeeper.get().id()); + self->db.clearInterf(ProcessClass::RateKeeperClass); + } else { + RatekeeperInterface rkInterf = wait( startRatekeeper(self) ); + self->db.setRatekeeper(rkInterf); } } } @@ -2385,8 +2458,9 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, self.addActor.send( updatedChangingDatacenters(&self) ); self.addActor.send( updatedChangedDatacenters(&self) ); self.addActor.send( updateDatacenterVersionDifference(&self) ); - self.addActor.send( waitDDRejoinOrStartDD(&self, &interf) ); self.addActor.send( handleForcedRecoveries(&self, interf) ); + self.addActor.send( waitDDRejoinOrStartDD(&self) ); + self.addActor.send( waitRKRejoinOrStartRK(&self) ); //printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str()); loop choose { diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index aefbe167c8..db49a9d075 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -168,15 +168,16 @@ struct RegisterWorkerRequest { ClusterControllerPriorityInfo priorityInfo; Generation generation; Optional distributorInterf; + Optional ratekeeperInterf; ReplyPromise reply; RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {} - RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf) : - wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf) {} + RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf, Optional rkInterf) : + wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf) {} template void serialize( Ar& ar ) { - serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, reply); + serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, reply); } }; diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 48f5732fca..fe52e5e42a 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -29,7 +29,6 @@ #include "fdbserver/WaitFailure.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/IKeyValueStore.h" -#include "fdbserver/Ratekeeper.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbrpc/Replication.h" #include "flow/UnitTest.h" @@ -570,7 +569,6 @@ struct DDTeamCollection : ReferenceCounted { PromiseStream removedServers; 
std::set recruitingIds; // The IDs of the SS which are being recruited std::set recruitingLocalities; - Optional> >> serverChanges; Future initialFailureReactionDelay; Future initializationDoneActor; Promise serverTrackerErrorOut; @@ -629,13 +627,12 @@ struct DDTeamCollection : ReferenceCounted { Reference const& shardsAffectedByTeamFailure, DatabaseConfiguration configuration, std::vector> includedDCs, Optional>> otherTrackedDCs, - Optional>>> const& serverChanges, Future readyToStart, Reference> zeroHealthyTeams, bool primary, Reference> processingUnhealthy) : cx(cx), distributorId(distributorId), lock(lock), output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()), badTeamRemover(Void()), redundantTeamRemover(Void()), configuration(configuration), - serverChanges(serverChanges), readyToStart(readyToStart), + readyToStart(readyToStart), checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution)), initialFailureReactionDelay( delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution)), @@ -2839,10 +2836,6 @@ ACTOR Future storageServerTracker( state Future storeTracker = keyValueStoreTypeTracker( self, server ); state bool hasWrongStoreTypeOrDC = false; - if(self->serverChanges.present()) { - self->serverChanges.get().send( std::make_pair(server->id, server->lastKnownInterface) ); - } - try { loop { status.isUndesired = false; @@ -2933,9 +2926,6 @@ ACTOR Future storageServerTracker( when( wait( failureTracker ) ) { // The server is failed AND all data has been removed from it, so permanently remove it. TraceEvent("StatusMapChange", self->distributorId).detail("ServerID", server->id).detail("Status", "Removing"); - if(self->serverChanges.present()) { - self->serverChanges.get().send( std::make_pair(server->id, Optional()) ); - } if(server->updated.canBeSet()) { server->updated.send(Void()); @@ -3040,9 +3030,6 @@ ACTOR Future storageServerTracker( } interfaceChanged = server->onInterfaceChanged; - if(self->serverChanges.present()) { - self->serverChanges.get().send( std::make_pair(server->id, server->lastKnownInterface) ); - } // We rely on the old failureTracker being actorCancelled since the old actor now has a pointer to an invalid location status = ServerStatus( status.isFailed, status.isUndesired, server->lastKnownInterface.locality ); @@ -3460,13 +3447,11 @@ ACTOR Future pollMoveKeysLock( Database cx, MoveKeysLock lock ) { } } -ACTOR Future dataDistribution( - Reference> db, - UID myId, - PromiseStream< std::pair> > serverChanges, - double* lastLimited) +ACTOR Future dataDistribution(Reference self, + double* lastLimited) { - state Database cx = openDBOnServer(db, TaskDataDistributionLaunch, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); + state DatabaseConfiguration configuration = self->configuration->get(); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; //cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) ); @@ -3532,20 +3517,20 @@ ACTOR Future dataDistribution( Reference initData_ = wait( getInitialDataDistribution(cx, myId, lock, configuration.usableRegions > 1 ? 
remoteDcIds : std::vector>() ) ); initData = initData_; if(initData->shards.size() > 1) { - TraceEvent("DDInitGotInitialDD", myId) + TraceEvent("DDInitGotInitialDD", self->ddId) .detail("B", printable(initData->shards.end()[-2].key)) .detail("E", printable(initData->shards.end()[-1].key)) .detail("Src", describe(initData->shards.end()[-2].primarySrc)) .detail("Dest", describe(initData->shards.end()[-2].primaryDest)) .trackLatest("InitialDD"); } else { - TraceEvent("DDInitGotInitialDD", myId).detail("B","").detail("E", "").detail("Src", "[no items]").detail("Dest", "[no items]").trackLatest("InitialDD"); + TraceEvent("DDInitGotInitialDD", self->ddId).detail("B","").detail("E", "").detail("Src", "[no items]").detail("Dest", "[no items]").trackLatest("InitialDD"); } if (initData->mode) break; // mode may be set true by system operator using fdbcli - TraceEvent("DataDistributionDisabled", myId); + TraceEvent("DataDistributionDisabled", self->ddId); - TraceEvent("MovingData", myId) + TraceEvent("MovingData", self->ddId) .detail( "InFlight", 0 ) .detail( "InQueue", 0 ) .detail( "AverageShardSize", -1 ) @@ -3554,8 +3539,8 @@ ACTOR Future dataDistribution( .detail( "HighestPriority", 0 ) .trackLatest( "MovingData" ); - TraceEvent("TotalDataInFlight", myId).detail("Primary", true).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", 0).trackLatest("TotalDataInFlight"); - TraceEvent("TotalDataInFlight", myId).detail("Primary", false).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", configuration.usableRegions > 1 ? 0 : -1).trackLatest("TotalDataInFlightRemote"); + TraceEvent("TotalDataInFlight", self->ddId).detail("Primary", true).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", 0).trackLatest("TotalDataInFlight"); + TraceEvent("TotalDataInFlight", self->ddId).detail("Primary", false).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", configuration.usableRegions > 1 ? 0 : -1).trackLatest("TotalDataInFlightRemote"); wait( waitForDataDistributionEnabled(cx) ); TraceEvent("DataDistributionEnabled"); @@ -3573,12 +3558,12 @@ ACTOR Future dataDistribution( state Reference shardsAffectedByTeamFailure( new ShardsAffectedByTeamFailure ); state int shard = 0; - for(; shardshards.size() - 1; shard++) { + for (; shard < initData->shards.size() - 1; shard++) { KeyRangeRef keys = KeyRangeRef(initData->shards[shard].key, initData->shards[shard+1].key); shardsAffectedByTeamFailure->defineShard(keys); std::vector teams; teams.push_back(ShardsAffectedByTeamFailure::Team(initData->shards[shard].primarySrc, true)); - if(configuration.usableRegions > 1) { + if (configuration.usableRegions > 1) { teams.push_back(ShardsAffectedByTeamFailure::Team(initData->shards[shard].remoteSrc, false)); } if(g_network->isSimulated()) { @@ -3587,11 +3572,11 @@ ACTOR Future dataDistribution( } shardsAffectedByTeamFailure->moveShard(keys, teams); - if(initData->shards[shard].hasDest) { + if (initData->shards[shard].hasDest) { // This shard is already in flight. Ideally we should use dest in sABTF and generate a dataDistributionRelocator directly in // DataDistributionQueue to track it, but it's easier to just (with low priority) schedule it for movement. 
bool unhealthy = initData->shards[shard].primarySrc.size() != configuration.storageTeamSize; - if(!unhealthy && configuration.usableRegions > 1) { + if (!unhealthy && configuration.usableRegions > 1) { unhealthy = initData->shards[shard].remoteSrc.size() != configuration.storageTeamSize; } output.send( RelocateShard( keys, unhealthy ? PRIORITY_TEAM_UNHEALTHY : PRIORITY_RECOVER_MOVE ) ); @@ -3620,20 +3605,20 @@ ACTOR Future dataDistribution( } actors.push_back( pollMoveKeysLock(cx, lock) ); - actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, myId ), "DDTracker", myId, &normalDDQueueErrors() ) ); - actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, myId, storageTeamSize, lastLimited ), "DDQueue", myId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector teamCollectionsPtrs; - Reference primaryTeamCollection( new DDTeamCollection(cx, myId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? remoteDcIds : std::vector>(), serverChanges, readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); + Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->primaryDcId, configuration.usableRegions > 1 ? 
self->remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); teamCollectionsPtrs.push_back(primaryTeamCollection.getPtr()); if (configuration.usableRegions > 1) { - Reference remoteTeamCollection( new DDTeamCollection(cx, myId, lock, output, shardsAffectedByTeamFailure, configuration, remoteDcIds, Optional>>(), serverChanges, readyToStart.getFuture() && remoteRecovered(db), zeroHealthyTeams[1], false, processingUnhealthy) ); + Reference remoteTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->remoteDcIds, Optional>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo), zeroHealthyTeams[1], false, processingUnhealthy) ); teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr()); remoteTeamCollection->teamCollections = teamCollectionsPtrs; - actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], db ), "DDTeamCollectionSecondary", myId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], self->dbInfo ), "DDTeamCollectionSecondary", self->ddId, &normalDDQueueErrors() ) ); } primaryTeamCollection->teamCollections = teamCollectionsPtrs; - actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( primaryTeamCollection, initData, tcis[0], db ), "DDTeamCollectionPrimary", myId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( primaryTeamCollection, initData, tcis[0], self->dbInfo ), "DDTeamCollectionPrimary", self->ddId, &normalDDQueueErrors() ) ); actors.push_back(yieldPromiseStream(output.getFuture(), input)); wait( waitForAll( actors ) ); @@ -3654,7 +3639,6 @@ ACTOR Future dataDistribution( struct DataDistributorData : NonCopyable, ReferenceCounted { Reference> dbInfo; UID ddId; - PromiseStream< std::pair> > ddStorageServerChanges; PromiseStream> addActor; DataDistributorData(Reference> const& db, UID id) : dbInfo(db), ddId(id) {} @@ -3672,19 +3656,7 @@ static std::set const& normalDataDistributorErrors() { return s; } -static std::set const& normalRateKeeperErrors() { - static std::set s; - if (s.empty()) { - s.insert( error_code_worker_removed ); - s.insert( error_code_broken_promise ); - s.insert( error_code_actor_cancelled ); - s.insert( error_code_please_reboot ); - } - return s; -} - ACTOR Future dataDistributor(DataDistributorInterface di, Reference> db ) { - state UID lastClusterControllerID(0,0); state Reference self( new DataDistributorData(db, di.id()) ); state Future collection = actorCollection( self->addActor.getFuture() ); @@ -3693,10 +3665,8 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference> > ddStorageServerChanges; state double lastLimited = 0; - state Future distributor = reportErrorsExcept( dataDistribution( self->dbInfo, di.id(), ddStorageServerChanges, &lastLimited ), "DataDistribution", di.id(), &normalDataDistributorErrors() ); - self->addActor.send( reportErrorsExcept( rateKeeper( self->dbInfo, ddStorageServerChanges, di.getRateInfo.getFuture(), &lastLimited ), "Ratekeeper", di.id(), &normalRateKeeperErrors() ) ); + state Future distributor = reportErrorsExcept( dataDistribution( self->dbInfo, &lastLimited ), "DataDistribution", di.id(), &normalDataDistributorErrors() ); wait( distributor || collection ); } @@ -3732,7 +3702,6 @@ DDTeamCollection* testTeamCollection(int teamSize, IRepPolicyRef policy, int pro conf, {}, 
{}, - PromiseStream>>(), Future(Void()), Reference>( new AsyncVar(true) ), true, @@ -3765,7 +3734,7 @@ DDTeamCollection* testMachineTeamCollection(int teamSize, IRepPolicyRef policy, DDTeamCollection* collection = new DDTeamCollection(database, UID(0, 0), MoveKeysLock(), PromiseStream(), Reference(new ShardsAffectedByTeamFailure()), conf, {}, {}, - PromiseStream>>(), Future(Void()), + Future(Void()), Reference>(new AsyncVar(true)), true, Reference>(new AsyncVar(false))); diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index b838217192..1baff7f58e 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -253,5 +253,6 @@ int64_t getMaxShardSize( double dbSizeEstimate ); class DDTeamCollection; ACTOR Future teamRemover(DDTeamCollection* self); ACTOR Future teamRemoverPeriodic(DDTeamCollection* self); +ACTOR Future>> getServerListAndProcessClasses(Transaction* tr); #endif diff --git a/fdbserver/DataDistributorInterface.h b/fdbserver/DataDistributorInterface.h index 2150eb08e5..d437fc69ae 100644 --- a/fdbserver/DataDistributorInterface.h +++ b/fdbserver/DataDistributorInterface.h @@ -27,15 +27,14 @@ struct DataDistributorInterface { RequestStream> waitFailure; - RequestStream getRateInfo; struct LocalityData locality; DataDistributorInterface() {} explicit DataDistributorInterface(const struct LocalityData& l) : locality(l) {} void initEndpoints() {} - UID id() const { return getRateInfo.getEndpoint().token; } - NetworkAddress address() const { return getRateInfo.getEndpoint().getPrimaryAddress(); } + UID id() const { return waitFailure.getEndpoint().token; } + NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); } bool operator== (const DataDistributorInterface& r) const { return id() == r.id(); } @@ -45,36 +44,7 @@ struct DataDistributorInterface { template void serialize(Archive& ar) { - serializer(ar, waitFailure, getRateInfo, locality); - } -}; - -struct GetRateInfoRequest { - UID requesterID; - int64_t totalReleasedTransactions; - int64_t batchReleasedTransactions; - bool detailed; - ReplyPromise reply; - - GetRateInfoRequest() {} - GetRateInfoRequest(UID const& requesterID, int64_t totalReleasedTransactions, int64_t batchReleasedTransactions, bool detailed) - : requesterID(requesterID), totalReleasedTransactions(totalReleasedTransactions), batchReleasedTransactions(batchReleasedTransactions), detailed(detailed) {} - - template - void serialize(Ar& ar) { - serializer(ar, requesterID, totalReleasedTransactions, batchReleasedTransactions, detailed, reply); - } -}; - -struct GetRateInfoReply { - double transactionRate; - double batchTransactionRate; - double leaseDuration; - HealthMetrics healthMetrics; - - template - void serialize(Ar& ar) { - serializer(ar, transactionRate, batchTransactionRate, leaseDuration, healthMetrics); + serializer(ar, waitFailure, locality); } }; diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 8b37bb55fb..815d933211 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -307,11 +307,13 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( WAIT_FOR_GOOD_REMOTE_RECRUITMENT_DELAY, 5.0 ); init( ATTEMPT_RECRUITMENT_DELAY, 0.035 ); init( WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 1.0 ); + init( WAIT_FOR_RATEKEEPER_JOIN_DELAY, 1.0 ); init( WORKER_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) WORKER_FAILURE_TIME = 10.0; init( CHECK_OUTSTANDING_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) CHECK_OUTSTANDING_INTERVAL = 0.001; init( 
VERSION_LAG_METRIC_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) VERSION_LAG_METRIC_INTERVAL = 10.0; init( MAX_VERSION_DIFFERENCE, 20 * VERSIONS_PER_SECOND ); init( FORCE_RECOVERY_CHECK_DELAY, 5.0 ); + init( RATEKEEPER_FAILURE_TIME, 1.0 ); init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL, 600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0; init( EXPECTED_MASTER_FITNESS, ProcessClass::UnsetFit ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index f3698b3561..b2b77861db 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -248,12 +248,14 @@ public: double WAIT_FOR_GOOD_REMOTE_RECRUITMENT_DELAY; double ATTEMPT_RECRUITMENT_DELAY; double WAIT_FOR_DISTRIBUTOR_JOIN_DELAY; + double WAIT_FOR_RATEKEEPER_JOIN_DELAY; double WORKER_FAILURE_TIME; double CHECK_OUTSTANDING_INTERVAL; double INCOMPATIBLE_PEERS_LOGGING_INTERVAL; double VERSION_LAG_METRIC_INTERVAL; int64_t MAX_VERSION_DIFFERENCE; double FORCE_RECOVERY_CHECK_DELAY; + double RATEKEEPER_FAILURE_TIME; // Knobs used to select the best policy (via monte carlo) int POLICY_RATING_TESTS; // number of tests per policy (in order to compare) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 8755a69069..2b3299572e 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -97,18 +97,15 @@ ACTOR Future getRate(UID myID, Reference> db, int64 state int64_t lastTC = 0; - if (db->get().distributor.present()) { - nextRequestTimer = Void(); - } - + if (db->get().ratekeeper.present()) nextRequestTimer = Void(); loop choose { when ( wait( db->onChange() ) ) { - if ( db->get().distributor.present() ) { - TraceEvent("Proxy_DataDistributorChanged", myID) - .detail("DDID", db->get().distributor.get().id()); - nextRequestTimer = Void(); // trigger GetRate request + if ( db->get().ratekeeper.present() ) { + TraceEvent("Proxy_RatekeeperChanged", myID) + .detail("RKID", db->get().ratekeeper.get().id()); + nextRequestTimer = Void(); // trigger GetRate request } else { - TraceEvent("Proxy_DataDistributorDied", myID); + TraceEvent("Proxy_RatekeeperDied", myID); nextRequestTimer = Never(); reply = Never(); } @@ -116,7 +113,7 @@ ACTOR Future getRate(UID myID, Reference> db, int64 when ( wait( nextRequestTimer ) ) { nextRequestTimer = Never(); bool detailed = now() - lastDetailedReply > SERVER_KNOBS->DETAILED_METRIC_UPDATE_RATE; - reply = brokenPromiseToNever(db->get().distributor.get().getRateInfo.getReply(GetRateInfoRequest(myID, *inTransactionCount, *inBatchTransactionCount, detailed))); + reply = brokenPromiseToNever(db->get().ratekeeper.get().getRateInfo.getReply(GetRateInfoRequest(myID, *inTransactionCount, *inBatchTransactionCount, detailed))); expectingDetailedReply = detailed; } when ( GetRateInfoReply rep = wait(reply) ) { diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 1baf876d51..83a8778411 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -19,13 +19,14 @@ */ #include "flow/IndexedSet.h" -#include "fdbserver/Ratekeeper.h" #include "fdbrpc/FailureMonitor.h" -#include "fdbserver/Knobs.h" #include "fdbrpc/Smoother.h" -#include "fdbserver/ServerDBInfo.h" #include "fdbrpc/simulator.h" #include "fdbclient/ReadYourWrites.h" +#include "fdbserver/Knobs.h" +#include "fdbserver/DataDistribution.h" +#include "fdbserver/ServerDBInfo.h" +#include "fdbserver/WaitFailure.h" #include "flow/actorcompiler.h" // This must be the last #include. 
enum limitReason_t { @@ -146,7 +147,7 @@ struct TransactionCounts { TransactionCounts() : total(0), batch(0), time(0) {} }; -struct Ratekeeper { +struct RatekeeperData { Map storageQueueInfo; Map tlogQueueInfo; @@ -154,6 +155,7 @@ struct Ratekeeper { Smoother smoothReleasedTransactions, smoothBatchReleasedTransactions, smoothTotalDurableBytes; HealthMetrics healthMetrics; DatabaseConfiguration configuration; + PromiseStream> addActor; Int64MetricHandle actualTpsMetric; @@ -163,7 +165,7 @@ struct Ratekeeper { RatekeeperLimits normalLimits; RatekeeperLimits batchLimits; - Ratekeeper() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT), + RatekeeperData() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT), actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")), lastWarning(0), normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE), @@ -172,7 +174,7 @@ struct Ratekeeper { }; //SOMEDAY: template trackStorageServerQueueInfo and trackTLogQueueInfo into one function -ACTOR Future trackStorageServerQueueInfo( Ratekeeper* self, StorageServerInterface ssi ) { +ACTOR Future trackStorageServerQueueInfo( RatekeeperData* self, StorageServerInterface ssi ) { self->storageQueueInfo.insert( mapPair(ssi.id(), StorageQueueInfo(ssi.id(), ssi.locality) ) ); state Map::iterator myQueueInfo = self->storageQueueInfo.find(ssi.id()); TraceEvent("RkTracking", ssi.id()); @@ -217,7 +219,7 @@ ACTOR Future trackStorageServerQueueInfo( Ratekeeper* self, StorageServerI } } -ACTOR Future trackTLogQueueInfo( Ratekeeper* self, TLogInterface tli ) { +ACTOR Future trackTLogQueueInfo( RatekeeperData* self, TLogInterface tli ) { self->tlogQueueInfo.insert( mapPair(tli.id(), TLogQueueInfo(tli.id()) ) ); state Map::iterator myQueueInfo = self->tlogQueueInfo.find(tli.id()); TraceEvent("RkTracking", tli.id()); @@ -270,7 +272,7 @@ ACTOR Future splitError( Future in, Promise errOut ) { } ACTOR Future trackEachStorageServer( - Ratekeeper* self, + RatekeeperData* self, FutureStream< std::pair> > serverChanges ) { state Map> actors; @@ -289,7 +291,59 @@ ACTOR Future trackEachStorageServer( } } -void updateRate( Ratekeeper* self, RatekeeperLimits &limits ) { +ACTOR Future monitorServerListChange( + Reference> dbInfo, + PromiseStream< std::pair> > serverChanges) { + state Database db = openDBOnServer(dbInfo, TaskRateKeeper, true, true); + state Future checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY); + state Future>> serverListAndProcessClasses = Never(); + state std::map oldServers; + state Transaction tr(db); + + loop { + try { + choose { + when ( wait( checkSignal ) ) { + checkSignal = Never(); + serverListAndProcessClasses = getServerListAndProcessClasses(&tr); + } + when ( vector> results = wait( serverListAndProcessClasses ) ) { + serverListAndProcessClasses = Never(); + + std::map newServers; + for( int i = 0; i < results.size(); i++ ) { + UID serverId = results[i].first.id(); + StorageServerInterface const& ssi = results[i].first; + newServers[serverId] = ssi; + + if ( oldServers.count( serverId ) ) { + if (ssi.getValue.getEndpoint() != oldServers[serverId].getValue.getEndpoint()) { + 
serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); + } + oldServers.erase(serverId); + } else { + serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); + } + } + + for (auto it : oldServers) { + serverChanges.send( std::make_pair(it.first, Optional()) ); + } + + oldServers.swap(newServers); + tr = Transaction(db); + checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY); + } + } + } catch(Error& e) { + wait( tr.onError(e) ); + serverListAndProcessClasses = Never(); + checkSignal = Void(); + } + } +} + +void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { //double controlFactor = ; // dt / eFoldingTime double actualTps = self->smoothReleasedTransactions.smoothRate(); @@ -566,7 +620,7 @@ void updateRate( Ratekeeper* self, RatekeeperLimits &limits ) { } } -ACTOR Future configurationMonitor( Ratekeeper* self, Reference> dbInfo ) { +ACTOR Future configurationMonitor(Reference> dbInfo, DatabaseConfiguration* conf) { state Database cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); loop { state ReadYourWritesTransaction tr(cx); @@ -578,7 +632,7 @@ ACTOR Future configurationMonitor( Ratekeeper* self, Reference results = wait( tr.getRange( configKeys, CLIENT_KNOBS->TOO_MANY ) ); ASSERT( !results.more && results.size() < CLIENT_KNOBS->TOO_MANY ); - self->configuration.fromKeyValues( (VectorRef) results ); + conf->fromKeyValues( (VectorRef) results ); state Future watchFuture = tr.watch(moveKeysLockOwnerKey); wait( tr.commit() ); @@ -591,21 +645,26 @@ ACTOR Future configurationMonitor( Ratekeeper* self, Reference rateKeeper( - Reference> dbInfo, - PromiseStream< std::pair> > serverChanges, - FutureStream< struct GetRateInfoRequest > getRateInfo, - double* lastLimited) -{ - state Ratekeeper self; - state Future track = trackEachStorageServer( &self, serverChanges.getFuture() ); +ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference> dbInfo) { + state RatekeeperData self; state Future timeout = Void(); state std::vector> actors; state std::vector> tlogTrackers; state std::vector tlogInterfs; state Promise err; - state Future configMonitor = configurationMonitor(&self, dbInfo); - self.lastLimited = lastLimited; + state Future collection = actorCollection( self.addActor.getFuture() ); + + // TODOs: + double lastLimited; + self.lastLimited = &lastLimited; + + TraceEvent("Ratekeeper_Starting", rkInterf.id()); + self.addActor.send( waitFailureServer(rkInterf.waitFailure.getFuture()) ); + self.addActor.send( configurationMonitor(dbInfo, &self.configuration) ); + + PromiseStream< std::pair> > serverChanges; + self.addActor.send( monitorServerListChange(dbInfo, serverChanges) ); + self.addActor.send( trackEachStorageServer(&self, serverChanges.getFuture()) ); TraceEvent("RkTLogQueueSizeParameters").detail("Target", SERVER_KNOBS->TARGET_BYTES_PER_TLOG).detail("Spring", SERVER_KNOBS->SPRING_BYTES_TLOG) .detail("Rate", (SERVER_KNOBS->TARGET_BYTES_PER_TLOG - SERVER_KNOBS->SPRING_BYTES_TLOG) / ((((double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) / SERVER_KNOBS->VERSIONS_PER_SECOND) + 2.0)); @@ -619,7 +678,6 @@ ACTOR Future rateKeeper( loop{ choose { - when (wait( track )) { break; } when (wait( timeout )) { updateRate(&self, self.normalLimits); updateRate(&self, self.batchLimits); @@ -638,7 +696,7 @@ ACTOR Future rateKeeper( } timeout = delayJittered(SERVER_KNOBS->METRIC_UPDATE_RATE); } - when (GetRateInfoRequest req = waitNext(getRateInfo)) { + when (GetRateInfoRequest req = waitNext(rkInterf.getRateInfo.getFuture())) { GetRateInfoReply reply; auto& p 
= self.proxy_transactionCounts[ req.requesterID ]; @@ -672,8 +730,10 @@ ACTOR Future rateKeeper( tlogTrackers.push_back( splitError( trackTLogQueueInfo(&self, tlogInterfs[i]), err ) ); } } - when(wait(configMonitor)) {} + when ( wait(collection) ) { + ASSERT(false); + throw internal_error(); + } } } - return Void(); } diff --git a/fdbserver/Ratekeeper.h b/fdbserver/Ratekeeper.h deleted file mode 100644 index 282e99f766..0000000000 --- a/fdbserver/Ratekeeper.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Ratekeeper.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef FDBSERVER_RATEKEEPER_H -#define FDBSERVER_RATEKEEPER_H -#pragma once - -#include "fdbserver/MasterInterface.h" -#include "fdbserver/TLogInterface.h" -#include "fdbclient/DatabaseConfiguration.h" - -Future rateKeeper( - Reference> const& dbInfo, - PromiseStream< std::pair> > const& serverChanges, // actually an input, but we don't want broken_promise - FutureStream< struct GetRateInfoRequest > const& getRateInfo, - double* const& lastLimited); - -#endif diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h new file mode 100644 index 0000000000..cb5049a595 --- /dev/null +++ b/fdbserver/RatekeeperInterface.h @@ -0,0 +1,93 @@ +/* + * RatekeeperInterface.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+
+#ifndef FDBSERVER_RATEKEEPERINTERFACE_H
+#define FDBSERVER_RATEKEEPERINTERFACE_H
+
+#include "fdbclient/StorageServerInterface.h"
+#include "fdbclient/FDBTypes.h"
+#include "fdbrpc/fdbrpc.h"
+#include "fdbrpc/Locality.h"
+
+struct RatekeeperInterface {
+	RequestStream<ReplyPromise<Void>> waitFailure;
+	RequestStream<struct GetRateInfoRequest> getRateInfo;
+	RequestStream<struct StorageChangeRequest> changeStorage;
+	struct LocalityData locality;
+
+	RatekeeperInterface() {}
+	explicit RatekeeperInterface(const struct LocalityData& l) : locality(l) {}
+
+	void initEndpoints() {}
+	UID id() const { return getRateInfo.getEndpoint().token; }
+	NetworkAddress address() const { return getRateInfo.getEndpoint().address; }
+	bool operator== (const RatekeeperInterface& r) const {
+		return id() == r.id();
+	}
+	bool operator!= (const RatekeeperInterface& r) const {
+		return !(*this == r);
+	}
+
+	template <class Archive>
+	void serialize(Archive& ar) {
+		serializer(ar, waitFailure, getRateInfo, changeStorage, locality);
+	}
+};
+
+struct GetRateInfoRequest {
+	UID requesterID;
+	int64_t totalReleasedTransactions;
+	int64_t batchReleasedTransactions;
+	bool detailed;
+	ReplyPromise<struct GetRateInfoReply> reply;
+
+	GetRateInfoRequest() {}
+	GetRateInfoRequest(UID const& requesterID, int64_t totalReleasedTransactions, int64_t batchReleasedTransactions, bool detailed)
+		: requesterID(requesterID), totalReleasedTransactions(totalReleasedTransactions), batchReleasedTransactions(batchReleasedTransactions), detailed(detailed) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, requesterID, totalReleasedTransactions, batchReleasedTransactions, detailed, reply);
+	}
+};
+
+struct GetRateInfoReply {
+	double transactionRate;
+	double batchTransactionRate;
+	double leaseDuration;
+	HealthMetrics healthMetrics;
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, transactionRate, batchTransactionRate, leaseDuration, healthMetrics);
+	}
+};
+
+struct StorageChangeRequest {
+	UID ssID;
+	Optional<StorageServerInterface> ssInterf;
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, ssID, ssInterf);
+	}
+};
+
+#endif //FDBSERVER_RATEKEEPERINTERFACE_H
diff --git a/fdbserver/ServerDBInfo.h b/fdbserver/ServerDBInfo.h
index abb7be412c..c5a76f831b 100644
--- a/fdbserver/ServerDBInfo.h
+++ b/fdbserver/ServerDBInfo.h
@@ -26,6 +26,7 @@
 #include "fdbserver/DataDistributorInterface.h"
 #include "fdbserver/MasterInterface.h"
 #include "fdbserver/LogSystemConfig.h"
+#include "fdbserver/RatekeeperInterface.h"
 #include "fdbserver/RecoveryState.h"
 #include "fdbserver/LatencyBandConfig.h"
@@ -39,6 +40,7 @@ struct ServerDBInfo {
 	ClientDBInfo client; // After a successful recovery, eventually proxies that communicate with it
 	Optional<DataDistributorInterface> distributor; // The best guess of current data distributor.
 	MasterInterface master; // The best guess as to the most recent master, which might still be recovering
+	Optional<RatekeeperInterface> ratekeeper;
 	vector<ResolverInterface> resolvers;
 	DBRecoveryCount recoveryCount; // A recovery count from DBCoreState. A successful master recovery increments it twice; unsuccessful recoveries may increment it once. Depending on where the current master is in its recovery process, this might not have been written by the current master.
RecoveryState recoveryState; diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index c3f6b1bb49..494ff3fcd3 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -28,6 +28,7 @@ #include "fdbserver/DataDistributorInterface.h" #include "fdbserver/MasterInterface.h" #include "fdbserver/TLogInterface.h" +#include "fdbserver/RatekeeperInterface.h" #include "fdbserver/ResolverInterface.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/TesterInterface.actor.h" @@ -46,6 +47,7 @@ struct WorkerInterface { RequestStream< struct RecruitMasterRequest > master; RequestStream< struct InitializeMasterProxyRequest > masterProxy; RequestStream< struct InitializeDataDistributorRequest > dataDistributor; + RequestStream< struct InitializeRatekeeperRequest > ratekeeper; RequestStream< struct InitializeResolverRequest > resolver; RequestStream< struct InitializeStorageRequest > storage; RequestStream< struct InitializeLogRouterRequest > logRouter; @@ -68,7 +70,7 @@ struct WorkerInterface { template void serialize(Ar& ar) { - serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest); + serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, ratekeeper, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest); } }; @@ -151,6 +153,16 @@ struct InitializeDataDistributorRequest { } }; +struct InitializeRatekeeperRequest { + UID reqId; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, reqId, reply); + } +}; + struct InitializeResolverRequest { uint64_t recoveryCount; int proxyCount; @@ -300,6 +312,7 @@ struct Role { static const Role TESTER; static const Role LOG_ROUTER; static const Role DATA_DISTRIBUTOR; + static const Role RATE_KEEPER; std::string roleName; std::string abbreviation; @@ -361,6 +374,7 @@ ACTOR Future resolver(ResolverInterface proxy, InitializeResolverRequest i ACTOR Future logRouter(TLogInterface interf, InitializeLogRouterRequest req, Reference> db); ACTOR Future dataDistributor(DataDistributorInterface ddi, Reference> db); +ACTOR Future rateKeeper(RatekeeperInterface const& rki, Reference> const& db); void registerThreadForProfiling(); void updateCpuProfiler(ProfilerRequest req); diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 36e73c121a..483e0e5aec 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -191,7 +191,7 @@ false - + diff --git a/fdbserver/fdbserver.vcxproj.filters b/fdbserver/fdbserver.vcxproj.filters index 9c27ac6fad..3395d97ab0 100644 --- a/fdbserver/fdbserver.vcxproj.filters +++ b/fdbserver/fdbserver.vcxproj.filters @@ -310,6 +310,7 @@ + @@ -343,7 +344,7 @@ - + diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 6460f49403..d242ec446c 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -31,7 +31,6 @@ #include #include "fdbserver/WaitFailure.h" #include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/Ratekeeper.h" #include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/CoordinatedState.h" diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 
c4b0dd0def..88d8ec85d0 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -349,14 +349,15 @@ ACTOR Future registrationClient( WorkerInterface interf, Reference> asyncPriorityInfo, ProcessClass initialClass, - Reference>> ddInterf) { + Reference>> ddInterf, + Reference>> rkInterf) { // Keeps the cluster controller (as it may be re-elected) informed that this worker exists // The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply (requiring us to re-register) // The registration request piggybacks optional distributor interface if it exists. state Generation requestGeneration = 0; state ProcessClass processClass = initialClass; loop { - RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get()); + RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get()); Future registrationReply = ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().registerWorker.getReply(request) ) : Never(); choose { when ( RegisterWorkerReply reply = wait( registrationReply )) { @@ -365,6 +366,7 @@ ACTOR Future registrationClient( } when ( wait( ccInterface->onChange() )) { } when ( wait( ddInterf->onChange() ) ) {} + when ( wait( rkInterf->onChange() ) ) {} } } } @@ -610,6 +612,7 @@ ACTOR Future workerServer( Reference connFile, Refe Reference> asyncPriorityInfo, ProcessClass initialClass, std::string folder, int64_t memoryLimit, std::string metricsConnFile, std::string metricsPrefix, Promise recoveredDiskFiles) { state PromiseStream< ErrorInfo > errors; state Reference>> ddInterf( new AsyncVar>() ); + state Reference>> rkInterf( new AsyncVar>() ); state Future handleErrors = workerHandleErrors( errors.getFuture() ); // Needs to be stopped last state ActorCollection errorForwarders(false); state Future loggingTrigger = Void(); @@ -756,7 +759,7 @@ ACTOR Future workerServer( Reference connFile, Refe wait(waitForAll(recoveries)); recoveredDiskFiles.send(Void()); - errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf ) ); + errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf ) ); TraceEvent("RecoveriesComplete", interf.id()); @@ -837,6 +840,22 @@ ACTOR Future workerServer( Reference connFile, Refe TraceEvent("DataDistributorReceived", req.reqId).detail("DataDistributorId", recruited.id()); req.reply.send(recruited); } + when ( InitializeRatekeeperRequest req = waitNext(interf.ratekeeper.getFuture()) ) { + RatekeeperInterface recruited(locality); + recruited.initEndpoints(); + + if (rkInterf->get().present()) { + recruited = rkInterf->get().get(); + TEST(true); // Recruited while already a ratekeeper. + } else { + startRole(Role::RATE_KEEPER, recruited.id(), interf.id()); + Future ratekeeper = rateKeeper( recruited, dbInfo ); + errorForwarders.add( forwardError( errors, Role::RATE_KEEPER, recruited.id(), setWhenDoneOrError( ratekeeper, rkInterf, Optional() ) ) ); + rkInterf->set(Optional(recruited)); + } + TraceEvent("Ratekeeper_InitRequest", req.reqId).detail("RatekeeperId", recruited.id()); + req.reply.send(recruited); + } when( InitializeTLogRequest req = waitNext(interf.tLog.getFuture()) ) { // For now, there's a one-to-one mapping of spill type to TLogVersion. 
// With future work, a particular version of the TLog can support multiple @@ -1244,3 +1263,4 @@ const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC"); const Role Role::TESTER("Tester", "TS"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); +const Role Role::RATE_KEEPER("RateKeeper", "RK"); diff --git a/flow/network.h b/flow/network.h index 3b569da349..0437c5febe 100644 --- a/flow/network.h +++ b/flow/network.h @@ -67,6 +67,7 @@ enum { TaskUnknownEndpoint = 4000, TaskMoveKeys = 3550, TaskDataDistributionLaunch = 3530, + TaskRateKeeper = 3510, TaskDataDistribution = 3500, TaskDiskWrite = 3010, TaskUpdateStorage = 3000, From e6ac3f7fe8bd635c2954a8e3c8d814e69c3a36b4 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 15 Feb 2019 17:29:52 -0800 Subject: [PATCH 27/56] Minor fix on ratekeeper work registration. --- fdbserver/ClusterController.actor.cpp | 17 +++++++++----- fdbserver/DataDistribution.actor.cpp | 33 +++++++++++++++++++++++++++ fdbserver/Ratekeeper.actor.cpp | 4 ++-- fdbserver/RatekeeperInterface.h | 13 +---------- 4 files changed, 47 insertions(+), 20 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index d95b9c1385..be5bfc53e4 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1769,6 +1769,11 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { TraceEvent("ClusterController_RegisterDataDistributor", self->id).detail("DDID", di.id()); self->db.setDistributor(di); } + if ( req.ratekeeperInterf.present() && !self->db.serverInfo->get().ratekeeper.present() ) { + const RatekeeperInterface& rki = req.ratekeeperInterf.get(); + TraceEvent("ClusterController_RegisterRatekeeper", self->id).detail("RKID", rki.id()); + self->db.setRatekeeper(rki); + } if( info == self->id_worker.end() ) { self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo ); checkOutstandingRequests( self ); @@ -2398,7 +2403,7 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { req.reqId = g_random->randomUniqueID(); TraceEvent("ClusterController_RecruitRatekeeper", req.reqId).detail("Addr", rkWorker.worker.first.address()); - ErrorOr interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); + ErrorOr interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY, 0) ); if (interf.present()) { TraceEvent("ClusterController_RatekeeperRecruited", req.reqId).detail("Addr", rkWorker.worker.first.address()); return interf.get(); @@ -2414,7 +2419,7 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { } } -ACTOR Future waitRKRejoinOrStartRK(ClusterControllerData *self) { +ACTOR Future monitorRatekeeper(ClusterControllerData *self) { state Future initialDelay = delay(SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY); // wait for a while to see if an existing ratekeeper will join. 
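The monitor actor introduced here (its body continues in the next hunk) waits for an existing ratekeeper to rejoin after the initial delay, otherwise recruits one, and restarts recruitment whenever the current ratekeeper fails. A condensed sketch of that loop, using only names that appear in this patch, with trace events and the initial-delay handling elided:

    ACTOR Future<Void> monitorRatekeeper(ClusterControllerData* self) {
        loop {
            if (self->db.serverInfo->get().ratekeeper.present()) {
                // Block until the failure monitor declares the current ratekeeper dead,
                // then unpublish it so the next iteration recruits a replacement.
                wait(waitFailureClient(self->db.serverInfo->get().ratekeeper.get().waitFailure,
                                       SERVER_KNOBS->RATEKEEPER_FAILURE_TIME));
                self->db.clearInterf(ProcessClass::RateKeeperClass);
            } else {
                // No ratekeeper registered: recruit one on a fit worker and publish it.
                RatekeeperInterface rkInterf = wait(startRatekeeper(self));
                self->db.setRatekeeper(rkInterf);
            }
        }
    }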
@@ -2432,8 +2437,8 @@ ACTOR Future waitRKRejoinOrStartRK(ClusterControllerData *self) { loop { if ( self->db.serverInfo->get().ratekeeper.present() ) { wait( waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ) ); - TraceEvent("ClusterController", self->id) - .detail("RatekeeperDied", self->db.serverInfo->get().ratekeeper.get().id()); + TraceEvent("ClusterController_RateKeeperDied", self->id) + .detail("RKID", self->db.serverInfo->get().ratekeeper.get().id()); self->db.clearInterf(ProcessClass::RateKeeperClass); } else { RatekeeperInterface rkInterf = wait( startRatekeeper(self) ); @@ -2460,7 +2465,7 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, self.addActor.send( updateDatacenterVersionDifference(&self) ); self.addActor.send( handleForcedRecoveries(&self, interf) ); self.addActor.send( waitDDRejoinOrStartDD(&self) ); - self.addActor.send( waitRKRejoinOrStartRK(&self) ); + self.addActor.send( monitorRatekeeper(&self) ); //printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str()); loop choose { diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index fe52e5e42a..e1b80d2314 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3454,6 +3454,39 @@ ACTOR Future dataDistribution(Reference self, state DatabaseConfiguration configuration = self->configuration->get(); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; +<<<<<<< HEAD +======= + state Transaction tr(cx); + loop { + try { + tr.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS ); + tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE ); + + Standalone replicaKeys = wait(tr.getRange(datacenterReplicasKeys, CLIENT_KNOBS->TOO_MANY)); + + for(auto& kv : replicaKeys) { + auto dcId = decodeDatacenterReplicasKey(kv.key); + auto replicas = decodeDatacenterReplicasValue(kv.value); + if ((self->primaryDcId.size() && self->primaryDcId[0] == dcId) || + (self->remoteDcIds.size() && self->remoteDcIds[0] == dcId && configuration.usableRegions > 1)) { + if(replicas > configuration.storageTeamSize) { + tr.set(kv.key, datacenterReplicasValue(configuration.storageTeamSize)); + } + } else { + tr.clear(kv.key); + } + } + + wait(tr.commit()); + break; + } + catch(Error &e) { + wait(tr.onError(e)); + } + } + + +>>>>>>> Minor fix on ratekeeper work registration. //cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) ); //ASSERT( cx->locationCacheSize == SERVER_KNOBS->DD_LOCATION_CACHE_SIZE ); diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 83a8778411..f872a55566 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -655,7 +655,7 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference collection = actorCollection( self.addActor.getFuture() ); // TODOs: - double lastLimited; + double lastLimited = 0; self.lastLimited = &lastLimited; TraceEvent("Ratekeeper_Starting", rkInterf.id()); diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h index cb5049a595..539aeb8d7f 100644 --- a/fdbserver/RatekeeperInterface.h +++ b/fdbserver/RatekeeperInterface.h @@ -29,7 +29,6 @@ struct RatekeeperInterface { RequestStream> waitFailure; RequestStream getRateInfo; - RequestStream changeStorage; struct LocalityData locality; RatekeeperInterface() {} @@ -47,7 +46,7 @@ struct RatekeeperInterface { template void serialize(Archive& ar) { - serializer(ar, waitFailure, getRateInfo, changeStorage, locality); + serializer(ar, waitFailure, getRateInfo, locality); } }; @@ -80,14 +79,4 @@ struct GetRateInfoReply { } }; -struct StorageChangeRequest { - UID ssID; - Optional ssInterf; - - template - void serialize(Ar& ar) { - serializer(ar, ssID, ssInterf); - } -}; - #endif //FDBSERVER_RATEKEEPERINTERFACE_H From 36a51a7b57d7a6f5f92249edb26c30fa2468f794 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Mon, 18 Feb 2019 14:57:21 -0800 Subject: [PATCH 28/56] Fix a segfault bug due to uncopied ratekeeper interface --- fdbserver/CMakeLists.txt | 1 - fdbserver/ClusterController.actor.cpp | 11 +++++++-- fdbserver/DataDistribution.actor.cpp | 32 +++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 58853a2fee..31af61de23 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -56,7 +56,6 @@ set(FDBSERVER_SRCS QuietDatabase.actor.cpp QuietDatabase.h Ratekeeper.actor.cpp - Ratekeeper.h RatekeeperInterface.h RecoveryState.h Restore.actor.cpp diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index be5bfc53e4..fef69bc7e6 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -936,6 +936,9 @@ public: if (db.serverInfo->get().distributor.present()) { id_used[db.serverInfo->get().distributor.get().locality.processId()]++; } + if (db.serverInfo->get().ratekeeper.present()) { + id_used[db.serverInfo->get().ratekeeper.get().locality.processId()]++; + } WorkerFitnessInfo mworker = getWorkerForRoleInDatacenter(clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db.config, id_used, true); if ( oldMasterFit < mworker.fitness ) @@ -1121,6 +1124,9 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster if (cluster->db.serverInfo->get().distributor.present()) { id_used[cluster->db.serverInfo->get().distributor.get().locality.processId()]++; } + if (cluster->db.serverInfo->get().ratekeeper.present()) { + id_used[cluster->db.serverInfo->get().ratekeeper.get().locality.processId()]++; + } state WorkerFitnessInfo masterWorker = cluster->getWorkerForRoleInDatacenter(cluster->clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db->config, id_used); if( ( masterWorker.worker.second.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS || masterWorker.worker.first.locality.processId() == cluster->clusterControllerProcessId ) && now() - cluster->startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY ) { @@ -1156,6 +1162,7 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster ++dbInfo.masterLifetime; dbInfo.clusterInterface = 
db->serverInfo->get().clusterInterface; dbInfo.distributor = db->serverInfo->get().distributor; + dbInfo.ratekeeper = db->serverInfo->get().ratekeeper; TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id); db->serverInfo->set( dbInfo ); @@ -2361,7 +2368,7 @@ ACTOR Future startDataDistributor( ClusterControllerDa } } -ACTOR Future waitDDRejoinOrStartDD(ClusterControllerData *self) { +ACTOR Future monitorDataDistributor(ClusterControllerData *self) { state Future initialDelay = delay(SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY); // wait for a while to see if existing data distributor will join. @@ -2464,7 +2471,7 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, self.addActor.send( updatedChangedDatacenters(&self) ); self.addActor.send( updateDatacenterVersionDifference(&self) ); self.addActor.send( handleForcedRecoveries(&self, interf) ); - self.addActor.send( waitDDRejoinOrStartDD(&self) ); + self.addActor.send( monitorDataDistributor(&self) ); self.addActor.send( monitorRatekeeper(&self) ); //printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str()); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index e1b80d2314..a46d5d8f01 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3447,6 +3447,38 @@ ACTOR Future pollMoveKeysLock( Database cx, MoveKeysLock lock ) { } } +<<<<<<< HEAD +======= +struct DataDistributorData : NonCopyable, ReferenceCounted { + Reference> dbInfo; + Reference> configuration; + std::vector> primaryDcId; + std::vector> remoteDcIds; + UID ddId; + PromiseStream> addActor; + + DataDistributorData(Reference> const& db, Reference> const& dbConfig, UID id) + : dbInfo(db), configuration(dbConfig), ddId(id) {} + + void refreshDcIds() { + primaryDcId.clear(); + remoteDcIds.clear(); + + const std::vector& regions = configuration->get().regions; + TraceEvent ev("DataDistributor", ddId); + if ( regions.size() > 0 ) { + primaryDcId.push_back( regions[0].dcId ); + ev.detail("PrimaryDcID", regions[0].dcId.toHexString()); + } + if ( regions.size() > 1 ) { + remoteDcIds.push_back( regions[1].dcId ); + ev.detail("SecondaryDcID", regions[1].dcId.toHexString()); + } + } +}; + +// TODO: remove lastLimited -- obtain this information of ratekeeper from proxy +>>>>>>> Fix a segfault bug due to uncopied ratekeeper interface ACTOR Future dataDistribution(Reference self, double* lastLimited) { From b2ee41ba33570f1db9b120ec3785f923a6fd68f1 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 19 Feb 2019 14:04:45 -0800 Subject: [PATCH 29/56] Remove lastLimited from data distribution Fix a serialization bug in ServerDBInfo, which causes test failures. 
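The "serialization bug" named above is the classic flat-serializer pitfall: `ratekeeper` was added to `ServerDBInfo` in the previous patch without being added to the `serializer(...)` field list, so the interface was silently dropped whenever the struct was shipped between processes. A minimal illustration of the bug class; `ExampleInfo` is a hypothetical type, not part of FDB:

    // Hypothetical struct: any field omitted from serializer() is silently
    // left default-constructed on the receiving side.
    struct ExampleInfo {
        UID id;
        Optional<RatekeeperInterface> ratekeeper; // newly added field

        template <class Ar>
        void serialize(Ar& ar) {
            // BUG: serializer(ar, id);     -- 'ratekeeper' never crosses the wire
            serializer(ar, id, ratekeeper); // FIX: list every field, in a stable order
        }
    };

The ServerDBInfo hunk below applies exactly this fix, inserting `ratekeeper` into the serializer call.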
--- fdbserver/DataDistribution.actor.cpp | 24 ++++++++++++++++++++++- fdbserver/DataDistribution.actor.h | 3 +-- fdbserver/DataDistributionQueue.actor.cpp | 6 +++--- fdbserver/Ratekeeper.actor.cpp | 9 ++++++--- fdbserver/ServerDBInfo.h | 2 +- 5 files changed, 34 insertions(+), 10 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index a46d5d8f01..667e606b35 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3478,9 +3478,13 @@ struct DataDistributorData : NonCopyable, ReferenceCounted }; // TODO: remove lastLimited -- obtain this information of ratekeeper from proxy +<<<<<<< HEAD >>>>>>> Fix a segfault bug due to uncopied ratekeeper interface ACTOR Future dataDistribution(Reference self, double* lastLimited) +======= +ACTOR Future dataDistribution(Reference self) +>>>>>>> Remove lastLimited from data distribution { state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); state DatabaseConfiguration configuration = self->configuration->get(); @@ -3671,7 +3675,7 @@ ACTOR Future dataDistribution(Reference self, actors.push_back( pollMoveKeysLock(cx, lock) ); actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) ); - actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector teamCollectionsPtrs; Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->primaryDcId, configuration.usableRegions > 1 ? 
self->remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); @@ -3730,10 +3734,28 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference distributor = reportErrorsExcept( dataDistribution( self->dbInfo, &lastLimited ), "DataDistribution", di.id(), &normalDataDistributorErrors() ); wait( distributor || collection ); +======= + state Future distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); + + loop choose { + when ( wait( self->configuration->onChange() ) ) { + TraceEvent("DataDistributor_Restart", di.id()) + .detail("Configuration", self->configuration->get().toString()); + self->refreshDcIds(); + distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); + } + when ( wait( collection ) ) { + ASSERT(false); + throw internal_error(); + } + when ( wait( distributor ) ) {} + } +>>>>>>> Remove lastLimited from data distribution } catch ( Error &err ) { if ( normalDataDistributorErrors().count(err.code()) == 0 ) { diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 1baff7f58e..109c57878b 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -230,8 +230,7 @@ Future dataDistributionQueue( MoveKeysLock const& lock, PromiseStream> const& getAverageShardBytes, UID const& distributorId, - int const& teamSize, - double* const& lastLimited); + int const& teamSize); //Holds the permitted size and IO Bounds for a shard struct ShardSizeBounds { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index f282ef12bc..f0b8d28d41 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1201,10 +1201,10 @@ ACTOR Future dataDistributionQueue( MoveKeysLock lock, PromiseStream> getAverageShardBytes, UID distributorId, - int teamSize, - double* lastLimited) + int teamSize) { - state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, lastLimited ); + state double lastLimited = 0; + state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, &lastLimited ); state std::set serversToLaunchFrom; state KeyRange keysToLaunchFrom; state RelocateData launchData; diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index f872a55566..81daee2629 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -648,7 +648,6 @@ ACTOR Future configurationMonitor(Reference> dbInfo ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference> dbInfo) { state RatekeeperData self; state Future timeout = Void(); - state std::vector> actors; state std::vector> tlogTrackers; state std::vector tlogInterfs; state Promise err; @@ -676,8 +675,8 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference rateKeeper(RatekeeperInterface rkInterf, Reference void serialize( Ar& ar ) { - serializer(ar, id, clusterInterface, client, distributor, master, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig); + serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, 
priorCommittedLogServers, latencyBandConfig); } }; From d52ff738c0dba2b4003cd1bb69bd89d62dfa921e Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 19 Feb 2019 21:05:24 -0800 Subject: [PATCH 30/56] Fix merge conflicts during rebase. --- fdbserver/DataDistribution.actor.cpp | 9 +++++++-- fdbserver/RatekeeperInterface.h | 2 +- fdbserver/WorkerInterface.actor.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 667e606b35..7a18ee8a32 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3535,9 +3535,10 @@ ACTOR Future dataDistribution(Reference self) loop { try { loop { - TraceEvent("DDInitTakingMoveKeysLock", myId); - MoveKeysLock lock_ = wait( takeMoveKeysLock( cx, myId ) ); + TraceEvent("DDInitTakingMoveKeysLock", self->ddId); + MoveKeysLock lock_ = wait( takeMoveKeysLock( cx, self->ddId ) ); lock = lock_; +<<<<<<< HEAD TraceEvent("DDInitTookMoveKeysLock", myId); DatabaseConfiguration configuration_ = wait( getDatabaseConfiguration(cx) ); @@ -3584,6 +3585,10 @@ ACTOR Future dataDistribution(Reference self) TraceEvent("DDInitUpdatedReplicaKeys", myId); Reference initData_ = wait( getInitialDataDistribution(cx, myId, lock, configuration.usableRegions > 1 ? remoteDcIds : std::vector>() ) ); +======= + TraceEvent("DDInitTookMoveKeysLock", self->ddId); + Reference initData_ = wait( getInitialDataDistribution(cx, self->ddId, lock, configuration.usableRegions > 1 ? self->remoteDcIds : std::vector>() ) ); +>>>>>>> Fix merge conflicts during rebase. initData = initData_; if(initData->shards.size() > 1) { TraceEvent("DDInitGotInitialDD", self->ddId) diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h index 539aeb8d7f..36a47e167a 100644 --- a/fdbserver/RatekeeperInterface.h +++ b/fdbserver/RatekeeperInterface.h @@ -36,7 +36,7 @@ struct RatekeeperInterface { void initEndpoints() {} UID id() const { return getRateInfo.getEndpoint().token; } - NetworkAddress address() const { return getRateInfo.getEndpoint().address; } + NetworkAddress address() const { return getRateInfo.getEndpoint().getPrimaryAddress(); } bool operator== (const RatekeeperInterface& r) const { return id() == r.id(); } diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 494ff3fcd3..370d780127 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -374,7 +374,7 @@ ACTOR Future resolver(ResolverInterface proxy, InitializeResolverRequest i ACTOR Future logRouter(TLogInterface interf, InitializeLogRouterRequest req, Reference> db); ACTOR Future dataDistributor(DataDistributorInterface ddi, Reference> db); -ACTOR Future rateKeeper(RatekeeperInterface const& rki, Reference> const& db); +ACTOR Future rateKeeper(RatekeeperInterface rki, Reference> db); void registerThreadForProfiling(); void updateCpuProfiler(ProfilerRequest req); From 517966fce219e484d90512108d09fdfeada848eb Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 20 Feb 2019 15:47:55 -0800 Subject: [PATCH 31/56] Remove lastLimited from rate keeper Refactor code to make IDE happy. 
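This refactor also moves UID allocation ahead of request construction in both recruiters, so the catch block can log `reqId` without touching a request object that may not yet be initialized. The recruitment round-trip itself, condensed from startRatekeeper(); getReplyUnlessFailedFor() yields an ErrorOr<T>: a reply if the worker answered, an error if it stayed failed past the join delay:

    state UID reqId = g_random->randomUniqueID();
    state InitializeRatekeeperRequest req(reqId);
    ErrorOr<RatekeeperInterface> interf = wait(
        rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(
            req, SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY, 0));
    if (interf.present()) {
        return interf.get(); // the caller publishes it via db.setRatekeeper()
    }
    // not recruited: fall through, wait briefly, and retry on another worker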
--- fdbserver/ClusterController.actor.cpp | 14 ++++++++------ fdbserver/Ratekeeper.actor.cpp | 5 ----- fdbserver/WorkerInterface.actor.h | 4 ++++ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index fef69bc7e6..c4b0213eeb 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -2336,11 +2336,11 @@ ACTOR Future handleForcedRecoveries( ClusterControllerData *self, ClusterC ACTOR Future startDataDistributor( ClusterControllerData *self ) { state Optional dcId = self->clusterControllerDcId; - state InitializeDataDistributorRequest req; while ( !self->clusterControllerProcessId.present() || !self->masterProcessId.present() ) { wait( delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY) ); } + state UID reqId; loop { try { while ( self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ) { @@ -2349,7 +2349,8 @@ ACTOR Future startDataDistributor( ClusterControllerDa std::map>, int> id_used = self->getUsedIds(); state WorkerFitnessInfo data_distributor = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::DataDistributor, ProcessClass::NeverAssign, self->db.config, id_used); - req.reqId = g_random->randomUniqueID(); + reqId = g_random->randomUniqueID(); + state InitializeDataDistributorRequest req(reqId); TraceEvent("ClusterController_DataDistributorRecruit", req.reqId).detail("Addr", data_distributor.worker.first.address()); ErrorOr distributor = wait( data_distributor.worker.first.dataDistributor.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); @@ -2359,7 +2360,7 @@ ACTOR Future startDataDistributor( ClusterControllerDa } } catch (Error& e) { - TraceEvent("ClusterController_DataDistributorRecruitError", req.reqId).error(e); + TraceEvent("ClusterController_DataDistributorRecruitError", reqId).error(e); if ( e.code() != error_code_no_more_servers ) { throw; } @@ -2397,6 +2398,7 @@ ACTOR Future monitorDataDistributor(ClusterControllerData *self) { } ACTOR Future startRatekeeper(ClusterControllerData *self) { + state UID reqId; loop { try { while ( self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ) { @@ -2406,8 +2408,8 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { std::map>, int> id_used = self->getUsedIds(); Optional dcId = self->clusterControllerDcId; state WorkerFitnessInfo rkWorker = self->getWorkerForRoleInDatacenter(dcId, ProcessClass::RateKeeper, ProcessClass::NeverAssign, self->db.config, id_used); - state InitializeRatekeeperRequest req; - req.reqId = g_random->randomUniqueID(); + reqId = g_random->randomUniqueID(); + state InitializeRatekeeperRequest req(reqId); TraceEvent("ClusterController_RecruitRatekeeper", req.reqId).detail("Addr", rkWorker.worker.first.address()); ErrorOr interf = wait( rkWorker.worker.first.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY, 0) ); @@ -2417,7 +2419,7 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { } } catch (Error& e) { - TraceEvent("ClusterController_RatekeeperRecruitError", req.reqId).error(e); + TraceEvent("ClusterController_RatekeeperRecruitError", reqId).error(e); if ( e.code() != error_code_no_more_servers ) { throw; } diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 81daee2629..ec6150a87e 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -160,7 +160,6 @@ struct RatekeeperData { Int64MetricHandle 
actualTpsMetric; double lastWarning; - double* lastLimited; RatekeeperLimits normalLimits; RatekeeperLimits batchLimits; @@ -653,10 +652,6 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference err; state Future collection = actorCollection( self.addActor.getFuture() ); - // TODOs: - double lastLimited = 0; - self.lastLimited = &lastLimited; - TraceEvent("Ratekeeper_Starting", rkInterf.id()); self.addActor.send( waitFailureServer(rkInterf.waitFailure.getFuture()) ); self.addActor.send( configurationMonitor(dbInfo, &self.configuration) ); diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 370d780127..f79a96f623 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -147,6 +147,8 @@ struct InitializeDataDistributorRequest { UID reqId; ReplyPromise reply; + InitializeDataDistributorRequest() {} + explicit InitializeDataDistributorRequest(UID uid) : reqId(uid) {} template void serialize( Ar& ar ) { serializer(ar, reqId, reply); @@ -157,6 +159,8 @@ struct InitializeRatekeeperRequest { UID reqId; ReplyPromise reply; + InitializeRatekeeperRequest() {} + explicit InitializeRatekeeperRequest(UID uid) : reqId(uid) {} template void serialize(Ar& ar) { serializer(ar, reqId, reply); From 5dcde9efe0a6cb6ce22bd3b90de4507a3e887715 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 21 Feb 2019 15:30:39 -0800 Subject: [PATCH 32/56] Fix locality per review comment and a mac compile error --- fdbrpc/Locality.cpp | 66 ++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/fdbrpc/Locality.cpp b/fdbrpc/Locality.cpp index ff9135d77e..caf3fa1d70 100644 --- a/fdbrpc/Locality.cpp +++ b/fdbrpc/Locality.cpp @@ -164,49 +164,35 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons } case ProcessClass::DataDistributor: switch( _class ) { - case ProcessClass::DataDistributorClass: - return ProcessClass::BestFit; - case ProcessClass::StatelessClass: - return ProcessClass::GoodFit; - case ProcessClass::MasterClass: - return ProcessClass::OkayFit; - case ProcessClass::ResolutionClass: - return ProcessClass::OkayFit; - case ProcessClass::TransactionClass: - return ProcessClass::OkayFit; - case ProcessClass::ProxyClass: - return ProcessClass::OkayFit; - case ProcessClass::UnsetClass: - return ProcessClass::UnsetFit; - case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; - case ProcessClass::TesterClass: - return ProcessClass::NeverAssign; - default: - return ProcessClass::WorstFit; + case ProcessClass::DataDistributorClass: + return ProcessClass::BestFit; + case ProcessClass::StatelessClass: + return ProcessClass::GoodFit; + case ProcessClass::MasterClass: + return ProcessClass::OkayFit; + case ProcessClass::UnsetClass: + return ProcessClass::UnsetFit; + case ProcessClass::CoordinatorClass: + case ProcessClass::TesterClass: + return ProcessClass::NeverAssign; + default: + return ProcessClass::WorstFit; } case ProcessClass::RateKeeper: switch( _class ) { - case ProcessClass::RateKeeperClass: - return ProcessClass::BestFit; - case ProcessClass::StatelessClass: - return ProcessClass::GoodFit; - case ProcessClass::MasterClass: - return ProcessClass::OkayFit; - case ProcessClass::ResolutionClass: - return ProcessClass::OkayFit; - case ProcessClass::TransactionClass: - return ProcessClass::OkayFit; - case ProcessClass::ProxyClass: - return ProcessClass::OkayFit; - case ProcessClass::UnsetClass: - return ProcessClass::UnsetFit; - case 
ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; - case ProcessClass::TesterClass: - return ProcessClass::NeverAssign; - default: - return ProcessClass::WorstFit; + case ProcessClass::RateKeeperClass: + return ProcessClass::BestFit; + case ProcessClass::StatelessClass: + return ProcessClass::GoodFit; + case ProcessClass::MasterClass: + return ProcessClass::OkayFit; + case ProcessClass::UnsetClass: + return ProcessClass::UnsetFit; + case ProcessClass::CoordinatorClass: + case ProcessClass::TesterClass: + return ProcessClass::NeverAssign; + default: + return ProcessClass::WorstFit; } default: return ProcessClass::NeverAssign; From 734099826125f3d6081126a6a4d5f4a9bcfb20ea Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 22 Feb 2019 15:04:38 -0800 Subject: [PATCH 33/56] Fix status message for ratekeeper --- documentation/sphinx/source/mr-status.rst | 1 + fdbclient/Schemas.cpp | 1 + fdbserver/Status.actor.cpp | 21 +++++++++++++++++---- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/documentation/sphinx/source/mr-status.rst b/documentation/sphinx/source/mr-status.rst index 61d2103b16..441624f4ad 100644 --- a/documentation/sphinx/source/mr-status.rst +++ b/documentation/sphinx/source/mr-status.rst @@ -339,6 +339,7 @@ cluster.messages log_servers_error Time cluster.messages transaction_start_timeout Unable to start transaction after __ seconds. cluster.messages unreachable_master_worker Unable to locate the master worker. cluster.messages unreachable_dataDistributor_worker Unable to locate the data distributor worker. +cluster.messages unreachable_ratekeeper_worker Unable to locate the ratekeeper worker. cluster.messages unreachable_processes The cluster has some unreachable processes. cluster.messages unreadable_configuration Unable to read database configuration. cluster.messages layer_status_incomplete Some or all of the layers subdocument could not be read. 
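Given the message text above and the schema entry added below, a status document emitted while the ratekeeper worker cannot be located would carry a message of this shape (illustrative JSON excerpt, not verbatim tool output):

    "cluster": {
        "messages": [
            {
                "name": "unreachable_ratekeeper_worker",
                "description": "Unable to locate the ratekeeper worker."
            }
        ]
    }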
diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 769d7e8d25..34b8f1826d 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -324,6 +324,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "$enum":[ "unreachable_master_worker", "unreachable_dataDistributor_worker", + "unreachable_ratekeeper_worker", "unreadable_configuration", "full_replication_timeout", "client_issues", diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 0b48d7fcf2..374d0728e3 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1387,7 +1387,7 @@ JsonBuilderObject getPerfLimit(TraceEventFields const& ratekeeper, double transP return perfLimit; } -ACTOR static Future workloadStatusFetcher(Reference> db, vector> workers, std::pair mWorker, std::pair ddWorker, +ACTOR static Future workloadStatusFetcher(Reference> db, vector> workers, std::pair mWorker, std::pair rkWorker, JsonBuilderObject *qos, JsonBuilderObject *data_overlay, std::set *incomplete_reasons, Future>>> storageServerFuture) { state JsonBuilderObject statusObj; @@ -1439,8 +1439,8 @@ ACTOR static Future workloadStatusFetcher(Reference clusterGetStatus( state std::set status_incomplete_reasons; state std::pair mWorker; state std::pair ddWorker; // DataDistributor worker + state std::pair rkWorker; // RateKeeper worker try { // Get the master Worker interface @@ -1837,6 +1838,18 @@ ACTOR Future clusterGetStatus( ddWorker = _ddWorker.get(); } + // Get the RateKeeper worker interface + Optional> _rkWorker; + if (db->get().ratekeeper.present()) { + _rkWorker = getWorker( workers, db->get().ratekeeper.get().address() ); + } + + if (!db->get().ratekeeper.present() || !_rkWorker.present()) { + messages.push_back(JsonString::makeMessage("unreachable_ratekeeper_worker", "Unable to locate the ratekeeper worker.")); + } else { + rkWorker = _rkWorker.get(); + } + // Get latest events for various event types from ALL workers // WorkerEvents is a map of worker's NetworkAddress to its event string // The pair represents worker responses and a set of worker NetworkAddress strings which did not respond @@ -1940,7 +1953,7 @@ ACTOR Future clusterGetStatus( state int minReplicasRemaining = -1; std::vector> futures2; futures2.push_back(dataStatusFetcher(ddWorker, &minReplicasRemaining)); - futures2.push_back(workloadStatusFetcher(db, workers, mWorker, ddWorker, &qos, &data_overlay, &status_incomplete_reasons, storageServerFuture)); + futures2.push_back(workloadStatusFetcher(db, workers, mWorker, rkWorker, &qos, &data_overlay, &status_incomplete_reasons, storageServerFuture)); futures2.push_back(layerStatusFetcher(cx, &messages, &status_incomplete_reasons)); futures2.push_back(lockedStatusFetcher(db, &messages, &status_incomplete_reasons)); From 835cc278c33819739520d2ce35cd7c84ce2d9db2 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 22 Feb 2019 16:36:07 -0800 Subject: [PATCH 34/56] Fix rebase conflicts. 
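The two preceding patches committed unresolved rebase conflicts verbatim; this patch removes them. The committed artifacts follow git's standard conflict-marker layout, in which the text after `>>>>>>>` is the subject of the commit being replayed:

    <<<<<<< HEAD
        // code as it stands on the branch being rebased onto
    =======
        // code from the commit being replayed
    >>>>>>> Minor fix on ratekeeper work registration.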
--- fdbserver/DataDistribution.actor.cpp | 115 +++------------------------ 1 file changed, 9 insertions(+), 106 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 7a18ee8a32..d664e07249 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3447,82 +3447,19 @@ ACTOR Future pollMoveKeysLock( Database cx, MoveKeysLock lock ) { } } -<<<<<<< HEAD -======= struct DataDistributorData : NonCopyable, ReferenceCounted { Reference> dbInfo; - Reference> configuration; - std::vector> primaryDcId; - std::vector> remoteDcIds; UID ddId; PromiseStream> addActor; - DataDistributorData(Reference> const& db, Reference> const& dbConfig, UID id) - : dbInfo(db), configuration(dbConfig), ddId(id) {} - - void refreshDcIds() { - primaryDcId.clear(); - remoteDcIds.clear(); - - const std::vector& regions = configuration->get().regions; - TraceEvent ev("DataDistributor", ddId); - if ( regions.size() > 0 ) { - primaryDcId.push_back( regions[0].dcId ); - ev.detail("PrimaryDcID", regions[0].dcId.toHexString()); - } - if ( regions.size() > 1 ) { - remoteDcIds.push_back( regions[1].dcId ); - ev.detail("SecondaryDcID", regions[1].dcId.toHexString()); - } - } + DataDistributorData(Reference> const& db, UID id) : dbInfo(db), ddId(id) {} }; -// TODO: remove lastLimited -- obtain this information of ratekeeper from proxy -<<<<<<< HEAD ->>>>>>> Fix a segfault bug due to uncopied ratekeeper interface -ACTOR Future dataDistribution(Reference self, - double* lastLimited) -======= ACTOR Future dataDistribution(Reference self) ->>>>>>> Remove lastLimited from data distribution { state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); - state DatabaseConfiguration configuration = self->configuration->get(); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; -<<<<<<< HEAD -======= - state Transaction tr(cx); - loop { - try { - tr.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS ); - tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE ); - - Standalone replicaKeys = wait(tr.getRange(datacenterReplicasKeys, CLIENT_KNOBS->TOO_MANY)); - - for(auto& kv : replicaKeys) { - auto dcId = decodeDatacenterReplicasKey(kv.key); - auto replicas = decodeDatacenterReplicasValue(kv.value); - if ((self->primaryDcId.size() && self->primaryDcId[0] == dcId) || - (self->remoteDcIds.size() && self->remoteDcIds[0] == dcId && configuration.usableRegions > 1)) { - if(replicas > configuration.storageTeamSize) { - tr.set(kv.key, datacenterReplicasValue(configuration.storageTeamSize)); - } - } else { - tr.clear(kv.key); - } - } - - wait(tr.commit()); - break; - } - catch(Error &e) { - wait(tr.onError(e)); - } - } - - ->>>>>>> Minor fix on ratekeeper work registration. 
//cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) ); //ASSERT( cx->locationCacheSize == SERVER_KNOBS->DD_LOCATION_CACHE_SIZE ); @@ -3538,8 +3475,7 @@ ACTOR Future dataDistribution(Reference self) TraceEvent("DDInitTakingMoveKeysLock", self->ddId); MoveKeysLock lock_ = wait( takeMoveKeysLock( cx, self->ddId ) ); lock = lock_; -<<<<<<< HEAD - TraceEvent("DDInitTookMoveKeysLock", myId); + TraceEvent("DDInitTookMoveKeysLock", self->ddId); DatabaseConfiguration configuration_ = wait( getDatabaseConfiguration(cx) ); configuration = configuration_; @@ -3553,7 +3489,7 @@ ACTOR Future dataDistribution(Reference self) remoteDcIds.push_back( regions[1].dcId ); } - TraceEvent("DDInitGotConfiguration", myId).detail("Conf", configuration.toString()); + TraceEvent("DDInitGotConfiguration", self->ddId).detail("Conf", configuration.toString()); state Transaction tr(cx); loop { @@ -3583,12 +3519,8 @@ ACTOR Future dataDistribution(Reference self) } } - TraceEvent("DDInitUpdatedReplicaKeys", myId); - Reference initData_ = wait( getInitialDataDistribution(cx, myId, lock, configuration.usableRegions > 1 ? remoteDcIds : std::vector>() ) ); -======= - TraceEvent("DDInitTookMoveKeysLock", self->ddId); - Reference initData_ = wait( getInitialDataDistribution(cx, self->ddId, lock, configuration.usableRegions > 1 ? self->remoteDcIds : std::vector>() ) ); ->>>>>>> Fix merge conflicts during rebase. + TraceEvent("DDInitUpdatedReplicaKeys", self->ddId); + Reference initData_ = wait( getInitialDataDistribution(cx, self->ddId, lock, configuration.usableRegions > 1 ? remoteDcIds : std::vector>() ) ); initData = initData_; if(initData->shards.size() > 1) { TraceEvent("DDInitGotInitialDD", self->ddId) @@ -3683,10 +3615,10 @@ ACTOR Future dataDistribution(Reference self) actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector teamCollectionsPtrs; - Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->primaryDcId, configuration.usableRegions > 1 ? self->remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); + Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? 
remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); teamCollectionsPtrs.push_back(primaryTeamCollection.getPtr()); if (configuration.usableRegions > 1) { - Reference remoteTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, self->remoteDcIds, Optional>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo), zeroHealthyTeams[1], false, processingUnhealthy) ); + Reference remoteTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, remoteDcIds, Optional>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo), zeroHealthyTeams[1], false, processingUnhealthy) ); teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr()); remoteTeamCollection->teamCollections = teamCollectionsPtrs; actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], self->dbInfo ), "DDTeamCollectionSecondary", self->ddId, &normalDDQueueErrors() ) ); @@ -3710,14 +3642,6 @@ ACTOR Future dataDistribution(Reference self) } } -struct DataDistributorData : NonCopyable, ReferenceCounted { - Reference> dbInfo; - UID ddId; - PromiseStream> addActor; - - DataDistributorData(Reference> const& db, UID id) : dbInfo(db), ddId(id) {} -}; - static std::set const& normalDataDistributorErrors() { static std::set s; if (s.empty()) { @@ -3734,33 +3658,12 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference self( new DataDistributorData(db, di.id()) ); state Future collection = actorCollection( self->addActor.getFuture() ); - TraceEvent("DataDistributor_Starting", di.id()); - self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) ); - try { TraceEvent("DataDistributor_Running", di.id()); -<<<<<<< HEAD - state double lastLimited = 0; - state Future distributor = reportErrorsExcept( dataDistribution( self->dbInfo, &lastLimited ), "DataDistribution", di.id(), &normalDataDistributorErrors() ); - - wait( distributor || collection ); -======= + self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) ); state Future distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); - loop choose { - when ( wait( self->configuration->onChange() ) ) { - TraceEvent("DataDistributor_Restart", di.id()) - .detail("Configuration", self->configuration->get().toString()); - self->refreshDcIds(); - distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); - } - when ( wait( collection ) ) { - ASSERT(false); - throw internal_error(); - } - when ( wait( distributor ) ) {} - } ->>>>>>> Remove lastLimited from data distribution + wait( distributor || collection ); } catch ( Error &err ) { if ( normalDataDistributorErrors().count(err.code()) == 0 ) { From dc129207a92098126260c627c57704d02b1d6a3a Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 27 Feb 2019 11:51:48 -0800 Subject: [PATCH 35/56] Minor fix after rebase. 
--- fdbserver/DataDistributorInterface.h | 1 - fdbserver/MasterProxyServer.actor.cpp | 12 ------------ fdbserver/Ratekeeper.actor.cpp | 7 +------ fdbserver/RatekeeperInterface.h | 1 - 4 files changed, 1 insertion(+), 20 deletions(-) diff --git a/fdbserver/DataDistributorInterface.h b/fdbserver/DataDistributorInterface.h index d437fc69ae..4c2f68f83d 100644 --- a/fdbserver/DataDistributorInterface.h +++ b/fdbserver/DataDistributorInterface.h @@ -21,7 +21,6 @@ #ifndef FDBSERVER_DATADISTRIBUTORINTERFACE_H #define FDBSERVER_DATADISTRIBUTORINTERFACE_H -#include "fdbclient/FDBTypes.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 2b3299572e..4cbe878eb6 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -76,17 +76,6 @@ struct ProxyStats { } }; -ACTOR template -Future forwardValue(Promise out, Future in) -{ - // Like forwardPromise, but throws on error - T t = wait(in); - out.send(t); - return Void(); -} - -int getBytes(Promise const& r) { return 0; } - ACTOR Future getRate(UID myID, Reference> db, int64_t* inTransactionCount, int64_t* inBatchTransactionCount, double* outTransactionRate, double* outBatchTransactionRate, GetHealthMetricsReply* healthMetricsReply, GetHealthMetricsReply* detailedHealthMetricsReply) { state Future nextRequestTimer = Never(); @@ -94,7 +83,6 @@ ACTOR Future getRate(UID myID, Reference> db, int64 state Future reply = Never(); state double lastDetailedReply = 0.0; // request detailed metrics immediately state bool expectingDetailedReply = false; - state int64_t lastTC = 0; if (db->get().ratekeeper.present()) nextRequestTimer = Void(); diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index ec6150a87e..00ea50e850 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -24,7 +24,7 @@ #include "fdbrpc/simulator.h" #include "fdbclient/ReadYourWrites.h" #include "fdbserver/Knobs.h" -#include "fdbserver/DataDistribution.h" +#include "fdbserver/DataDistribution.actor.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/WaitFailure.h" #include "flow/actorcompiler.h" // This must be the last #include. 
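getRate(), trimmed in the hunk above, is the proxy-side consumer of the ratekeeper interface: each proxy polls the ratekeeper published in ServerDBInfo for its transaction-rate budget. A condensed sketch of the polling loop, assuming only names visible in this patch series, with batch-rate and detailed-health-metrics handling elided:

    loop choose {
        when (wait(nextRequestTimer)) {
            nextRequestTimer = Never();
            reply = brokenPromiseToNever(db->get().ratekeeper.get().getRateInfo.getReply(
                GetRateInfoRequest(myID, *inTransactionCount, *inBatchTransactionCount,
                                   false /* detailed: see lastDetailedReply logic above */)));
        }
        when (GetRateInfoReply rep = wait(reply)) {
            reply = Never();
            *outTransactionRate = rep.transactionRate;
            nextRequestTimer = delayJittered(SERVER_KNOBS->METRIC_UPDATE_RATE); // re-arm
        }
        when (wait(db->onChange())) {
            // The ratekeeper may have moved or just appeared; poll the new one at once.
            if (db->get().ratekeeper.present()) nextRequestTimer = Void();
        }
    }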
@@ -676,11 +676,6 @@ ACTOR Future rateKeeper(RatekeeperInterface rkInterf, Reference SERVER_KNOBS->LAST_LIMITED_RATIO * self.batchLimits.tpsLimit) { - *self.lastLimited = now(); - } - - double tooOld = now() - 1.0; for(auto p=self.proxy_transactionCounts.begin(); p!=self.proxy_transactionCounts.end(); ) { if (p->second.time < tooOld) diff --git a/fdbserver/RatekeeperInterface.h b/fdbserver/RatekeeperInterface.h index 36a47e167a..c50447d544 100644 --- a/fdbserver/RatekeeperInterface.h +++ b/fdbserver/RatekeeperInterface.h @@ -21,7 +21,6 @@ #ifndef FDBSERVER_RATEKEEPERINTERFACE_H #define FDBSERVER_RATEKEEPERINTERFACE_H -#include "fdbclient/StorageServerInterface.h" #include "fdbclient/FDBTypes.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" From f43277e8192ee71c62a0492f55e6002d96b6106b Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 6 Mar 2019 10:46:17 -0800 Subject: [PATCH 36/56] Format Ratekeeper.actor.cpp code --- fdbserver/Ratekeeper.actor.cpp | 82 +++++++++++++++++----------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 00ea50e850..a0ff3ffd5e 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -310,12 +310,12 @@ ACTOR Future monitorServerListChange( serverListAndProcessClasses = Never(); std::map newServers; - for( int i = 0; i < results.size(); i++ ) { - UID serverId = results[i].first.id(); - StorageServerInterface const& ssi = results[i].first; + for (int i = 0; i < results.size(); i++) { + const StorageServerInterface& ssi = results[i].first; + const UID serverId = ssi.id(); newServers[serverId] = ssi; - if ( oldServers.count( serverId ) ) { + if (oldServers.count(serverId)) { if (ssi.getValue.getEndpoint() != oldServers[serverId].getValue.getEndpoint()) { serverChanges.send( std::make_pair(serverId, Optional(ssi)) ); } @@ -325,7 +325,7 @@ ACTOR Future monitorServerListChange( } } - for (auto it : oldServers) { + for (const auto& it : oldServers) { serverChanges.send( std::make_pair(it.first, Optional()) ); } @@ -342,7 +342,7 @@ ACTOR Future monitorServerListChange( } } -void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { +void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { //double controlFactor = ; // dt / eFoldingTime double actualTps = self->smoothReleasedTransactions.smoothRate(); @@ -350,7 +350,7 @@ void updateRate( RatekeeperData* self, RatekeeperLimits &limits ) { // SOMEDAY: Remove the max( 1.0, ... 
) since the below calculations _should_ be able to recover back up from this value actualTps = std::max( std::max( 1.0, actualTps ), self->smoothTotalDurableBytes.smoothRate() / CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT ); - limits.tpsLimit = std::numeric_limits<double>::infinity(); + limits->tpsLimit = std::numeric_limits<double>::infinity(); UID reasonID = UID(); limitReason_t limitReason = limitReason_t::unlimited; @@ -376,9 +376,9 @@ worstFreeSpaceStorageServer = std::min(worstFreeSpaceStorageServer, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace); - int64_t springBytes = std::max<int64_t>(1, std::min<int64_t>(limits.storageSpringBytes, (ss.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2)); - int64_t targetBytes = std::max<int64_t>(1, std::min(limits.storageTargetBytes, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace)); - if (targetBytes != limits.storageTargetBytes) { + int64_t springBytes = std::max<int64_t>(1, std::min<int64_t>(limits->storageSpringBytes, (ss.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2)); + int64_t targetBytes = std::max<int64_t>(1, std::min(limits->storageTargetBytes, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace)); + if (targetBytes != limits->storageTargetBytes) { if (minFreeSpace == SERVER_KNOBS->MIN_FREE_SPACE) { ssLimitReason = limitReason_t::storage_server_min_free_space; } else { @@ -442,9 +442,9 @@ storageTpsLimitReverseIndex.insert(std::make_pair(limitTps, &ss)); - if(limitTps < limits.tpsLimit && (ssLimitReason == limitReason_t::storage_server_min_free_space || ssLimitReason == limitReason_t::storage_server_min_free_space_ratio)) { + if (limitTps < limits->tpsLimit && (ssLimitReason == limitReason_t::storage_server_min_free_space || ssLimitReason == limitReason_t::storage_server_min_free_space_ratio)) { reasonID = ss.id; - limits.tpsLimit = limitTps; + limits->tpsLimit = limitTps; limitReason = ssLimitReason; } @@ -455,19 +455,19 @@ self->healthMetrics.worstStorageDurabilityLag = worstStorageDurabilityLagStorageServer; std::set<Optional<Standalone<StringRef>>> ignoredMachines; - for(auto ss = storageTpsLimitReverseIndex.begin(); ss != storageTpsLimitReverseIndex.end() && ss->first < limits.tpsLimit; ++ss) { - if(ignoredMachines.size() < std::min(self->configuration.storageTeamSize - 1, SERVER_KNOBS->MAX_MACHINES_FALLING_BEHIND)) { + for (auto ss = storageTpsLimitReverseIndex.begin(); ss != storageTpsLimitReverseIndex.end() && ss->first < limits->tpsLimit; ++ss) { + if (ignoredMachines.size() < std::min(self->configuration.storageTeamSize - 1, SERVER_KNOBS->MAX_MACHINES_FALLING_BEHIND)) { ignoredMachines.insert(ss->second->locality.zoneId()); continue; } - if(ignoredMachines.count(ss->second->locality.zoneId()) > 0) { + if (ignoredMachines.count(ss->second->locality.zoneId()) > 0) { continue; } limitingStorageQueueStorageServer = ss->second->lastReply.bytesInput - ss->second->smoothDurableBytes.smoothTotal(); - limits.tpsLimit = ss->first; - limitReason = ssReasons[storageTpsLimitReverseIndex.begin()->second->id]; + limits->tpsLimit = ss->first; reasonID = storageTpsLimitReverseIndex.begin()->second->id; // Although we aren't controlling based on the worst SS, we still report it as the limiting process + limitReason = ssReasons[reasonID]; break; } @@ -479,27 +479,27 @@ { Version minSSVer = std::numeric_limits<Version>::max(); Version minLimitingSSVer = std::numeric_limits<Version>::max(); - for(auto i = self->storageQueueInfo.begin(); i != self->storageQueueInfo.end(); ++i) { - auto& ss = i->value; + for (const auto& it : self->storageQueueInfo) { + auto& ss = it.value; if (!ss.valid) continue; minSSVer = std::min(minSSVer, ss.lastReply.version); // Machines that ratekeeper isn't controlling can fall arbitrarily far behind - if(ignoredMachines.count(i->value.locality.zoneId()) == 0) { + if (ignoredMachines.count(it.value.locality.zoneId()) == 0) { minLimitingSSVer = std::min(minLimitingSSVer, ss.lastReply.version); } } Version maxTLVer = std::numeric_limits<Version>::min(); - for(auto i = self->tlogQueueInfo.begin(); i != self->tlogQueueInfo.end(); ++i) { - auto& tl = i->value; + for(const auto& it : self->tlogQueueInfo) { + auto& tl = it.value; if (!tl.valid) continue; maxTLVer = std::max(maxTLVer, tl.lastReply.v); } // writeToReadLatencyLimit: 0 = infinite speed; 1 = TL durable speed ; 2 = half TL durable speed - writeToReadLatencyLimit = ((maxTLVer - minLimitingSSVer) - limits.maxVersionDifference/2) / (limits.maxVersionDifference/4); + writeToReadLatencyLimit = ((maxTLVer - minLimitingSSVer) - limits->maxVersionDifference/2) / (limits->maxVersionDifference/4); worstVersionLag = std::max((Version)0, maxTLVer - minSSVer); limitingVersionLag = std::max((Version)0, maxTLVer - minLimitingSSVer); } @@ -507,8 +507,8 @@ int64_t worstFreeSpaceTLog = std::numeric_limits<int64_t>::max(); int64_t worstStorageQueueTLog = 0; int tlcount = 0; - for(auto i = self->tlogQueueInfo.begin(); i != self->tlogQueueInfo.end(); ++i) { - auto& tl = i->value; + for (auto& it : self->tlogQueueInfo) { + auto& tl = it.value; if (!tl.valid) continue; ++tlcount; @@ -518,9 +518,9 @@ worstFreeSpaceTLog = std::min(worstFreeSpaceTLog, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace); - int64_t springBytes = std::max<int64_t>(1, std::min<int64_t>(limits.logSpringBytes, (tl.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2)); - int64_t targetBytes = std::max<int64_t>(1, std::min(limits.logTargetBytes, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace)); - if (targetBytes != limits.logTargetBytes) { + int64_t springBytes = std::max<int64_t>(1, std::min<int64_t>(limits->logSpringBytes, (tl.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2)); + int64_t targetBytes = std::max<int64_t>(1, std::min(limits->logTargetBytes, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace)); + if (targetBytes != limits->logTargetBytes) { if (minFreeSpace == SERVER_KNOBS->MIN_FREE_SPACE) { tlogLimitReason = limitReason_t::log_server_min_free_space; } else { @@ -540,7 +540,7 @@ } reasonID = tl.id; limitReason = limitReason_t::log_server_min_free_space; - limits.tpsLimit = 0.0; + limits->tpsLimit = 0.0; } double targetRateRatio = std::min( ( b + springBytes ) / (double)springBytes, 2.0 ); @@ -558,8 +558,8 @@ if (targetRateRatio < .75) //< FIXME: KNOB for 2.0 x = std::max(x, 0.95); double lim = actualTps * x; - if (lim < limits.tpsLimit){ - limits.tpsLimit = lim; + if (lim < limits->tpsLimit){ + limits->tpsLimit = lim; reasonID = tl.id; limitReason = tlogLimitReason; } @@ -568,8 +568,8 @@ // Don't let any tlogs use up its target bytes faster than its MVCC window!
double x = ((targetBytes - springBytes) / ((((double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS)/SERVER_KNOBS->VERSIONS_PER_SECOND) + 2.0)) / inputRate; double lim = actualTps * x; - if (lim < limits.tpsLimit){ - limits.tpsLimit = lim; + if (lim < limits->tpsLimit){ + limits->tpsLimit = lim; reasonID = tl.id; limitReason = limitReason_t::log_server_mvcc_write_bandwidth; } @@ -578,10 +578,10 @@ self->healthMetrics.worstTLogQueue = worstStorageQueueTLog; - limits.tpsLimit = std::max(limits.tpsLimit, 0.0); + limits->tpsLimit = std::max(limits->tpsLimit, 0.0); if(g_network->isSimulated() && g_simulator.speedUpSimulation) { - limits.tpsLimit = std::max(limits.tpsLimit, 100.0); + limits->tpsLimit = std::max(limits->tpsLimit, 100.0); } int64_t totalDiskUsageBytes = 0; @@ -592,13 +592,13 @@ if (s.value.valid) totalDiskUsageBytes += s.value.lastReply.storageBytes.used; - limits.tpsLimitMetric = std::min(limits.tpsLimit, 1e6); - limits.reasonMetric = limitReason; + limits->tpsLimitMetric = std::min(limits->tpsLimit, 1e6); + limits->reasonMetric = limitReason; if (g_random->random01() < 0.1) { - std::string name = "RkUpdate" + limits.context; + std::string name = "RkUpdate" + limits->context; TraceEvent(name.c_str()) - .detail("TPSLimit", limits.tpsLimit) + .detail("TPSLimit", limits->tpsLimit) .detail("Reason", limitReason) .detail("ReasonServerID", reasonID) .detail("ReleasedTPS", self->smoothReleasedTransactions.smoothRate()) @@ -673,8 +673,8 @@ ACTOR Future<Void> rateKeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<ServerDBInfo>> dbInfo) - updateRate(&self, self.normalLimits); - updateRate(&self, self.batchLimits); + updateRate(&self, &self.normalLimits); + updateRate(&self, &self.batchLimits); From: Jingyu Zhou Date: Thu, 7 Mar 2019 10:15:28 -0800 Subject: [PATCH 37/56] Data distributor pulls batch limited info from proxy Add a flag in HealthMetrics to indicate that batch priority is rate limited. The data distributor pulls this flag from a proxy to learn roughly when rate limiting happens, and uses it to decide when to do background rebalancing, i.e., moving data from heavily loaded servers to lighter ones. If the cluster is currently rate limited for batch commits, the rebalance uses longer polling intervals; otherwise it uses shorter ones. See BgDDMountainChopper() and BgDDValleyFiller() in DataDistributionQueue.actor.cpp.
--- fdbclient/FDBTypes.h | 8 +++++-- fdbserver/DataDistribution.actor.cpp | 28 ++++++++++++++++++++--- fdbserver/DataDistribution.actor.h | 3 ++- fdbserver/DataDistributionQueue.actor.cpp | 6 ++--- fdbserver/Ratekeeper.actor.cpp | 3 +++ 5 files changed, 39 insertions(+), 9 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 5b7de5e818..766d39831c 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -737,6 +737,7 @@ struct HealthMetrics { int64_t worstStorageDurabilityLag; int64_t worstTLogQueue; double tpsLimit; + bool batchLimited; std::map<UID, StorageStats> storageStats; std::map<UID, int64_t> tLogQueue; @@ -745,6 +746,7 @@ , worstStorageDurabilityLag(0) , worstTLogQueue(0) , tpsLimit(0.0) + , batchLimited(false) {} void update(const HealthMetrics& hm, bool detailedInput, bool detailedOutput) @@ -753,6 +755,7 @@ worstStorageDurabilityLag = hm.worstStorageDurabilityLag; worstTLogQueue = hm.worstTLogQueue; tpsLimit = hm.tpsLimit; + batchLimited = hm.batchLimited; if (!detailedOutput) { storageStats.clear(); @@ -769,13 +772,14 @@ worstStorageDurabilityLag == r.worstStorageDurabilityLag && worstTLogQueue == r.worstTLogQueue && storageStats == r.storageStats && - tLogQueue == r.tLogQueue + tLogQueue == r.tLogQueue && + batchLimited == r.batchLimited ); } template <class Ar> void serialize(Ar& ar) { - serializer(ar, worstStorageQueue, worstStorageDurabilityLag, worstTLogQueue, tpsLimit, storageStats, tLogQueue); + serializer(ar, worstStorageQueue, worstStorageDurabilityLag, worstTLogQueue, tpsLimit, batchLimited, storageStats, tLogQueue); } }; diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index d664e07249..e6af98ce49 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3455,7 +3455,7 @@ struct DataDistributorData : NonCopyable, ReferenceCounted<DataDistributorData> DataDistributorData(Reference<AsyncVar<ServerDBInfo>> const& db, UID id) : dbInfo(db), ddId(id) {} }; -ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self) +ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self, double* lastLimited) { state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; @@ -3612,7 +3612,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self) actors.push_back( pollMoveKeysLock(cx, lock) ); actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) ); - actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector<DDTeamCollection*> teamCollectionsPtrs; Reference<DDTeamCollection> primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ?
remoteDcIds : std::vector<Optional<Key>>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); @@ -3654,14 +3654,36 @@ static std::set<int> const& normalDataDistributorErrors() { return s; } +ACTOR Future<Void> monitorBatchLimitedTime(Reference<AsyncVar<ServerDBInfo>> db, double* lastLimited) { + loop { + wait( delay(SERVER_KNOBS->METRIC_UPDATE_RATE) ); + while (db->get().client.proxies.size() == 0) { + wait(db->onChange()); + } + + state int idx = g_random->randomInt(0, db->get().client.proxies.size()); + choose { + when (wait(db->onChange())) {} + when (ErrorOr<GetHealthMetricsReply> reply = wait( + db->get().client.proxies[idx].getHealthMetrics.getReplyUnlessFailedFor(GetHealthMetricsRequest(false), 1.0, 0))) { + if (reply.present() && reply.get().healthMetrics.batchLimited) { + *lastLimited = now(); + } + } + } + } +} + ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncVar<struct ServerDBInfo>> db ) { state Reference<DataDistributorData> self( new DataDistributorData(db, di.id()) ); state Future<Void> collection = actorCollection( self->addActor.getFuture() ); + state double lastLimited = 0; try { TraceEvent("DataDistributor_Running", di.id()); self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) ); - state Future<Void> distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); + self->addActor.send( monitorBatchLimitedTime(db, &lastLimited) ); + state Future<Void> distributor = reportErrorsExcept( dataDistribution(self, &lastLimited), "DataDistribution", di.id(), &normalDataDistributorErrors() ); wait( distributor || collection ); } diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 109c57878b..1baff7f58e 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -230,7 +230,8 @@ Future<Void> dataDistributionQueue( MoveKeysLock const& lock, PromiseStream<Promise<int64_t>> const& getAverageShardBytes, UID const& distributorId, - int const& teamSize); + int const& teamSize, + double* const& lastLimited); //Holds the permitted size and IO Bounds for a shard struct ShardSizeBounds { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index f0b8d28d41..f282ef12bc 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1201,10 +1201,10 @@ ACTOR Future<Void> dataDistributionQueue( MoveKeysLock lock, PromiseStream<Promise<int64_t>> getAverageShardBytes, UID distributorId, - int teamSize) + int teamSize, + double* lastLimited) { - state double lastLimited = 0; - state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, &lastLimited ); + state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, lastLimited ); state std::set<UID> serversToLaunchFrom; state KeyRange keysToLaunchFrom; state RelocateData launchData; diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index a0ff3ffd5e..46dcfe25f0 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -671,11 +671,13 @@ ACTOR Future<Void> rateKeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<ServerDBInfo>> dbInfo) + lastLimited = self.smoothReleasedTransactions.smoothRate() > SERVER_KNOBS->LAST_LIMITED_RATIO * self.batchLimits.tpsLimit; double tooOld = now() - 1.0; for(auto p=self.proxy_transactionCounts.begin(); p!=self.proxy_transactionCounts.end(); ) { if (p->second.time < tooOld) @@ -707,6 +709,7 @@ ACTOR Future<Void> rateKeeper(RatekeeperInterface rkInterf, Reference<AsyncVar< From: Ryan Worl Date: Sat, 9 Mar 2019
10:48:22 -0500 Subject: [PATCH 38/56] Address review comments and bugs found by running the binding tester against the Python bindings --- bindings/go/src/fdb/tuple/tuple.go | 38 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/bindings/go/src/fdb/tuple/tuple.go b/bindings/go/src/fdb/tuple/tuple.go index d3f289e2eb..5cc1990cc5 100644 --- a/bindings/go/src/fdb/tuple/tuple.go +++ b/bindings/go/src/fdb/tuple/tuple.go @@ -76,7 +76,7 @@ type UUID [16]byte // Versionstamp is a struct for a FoundationDB versionstamp. Versionstamps are // 12 bytes long composed of a 10 byte transaction version and a 2 byte user // version. The transaction version is filled in at commit time and the user -// version is provided by your layer during a transaction. +// version is provided by the application to order results within a transaction. type Versionstamp struct { TransactionVersion [10]byte UserVersion uint16 } @@ -295,9 +295,12 @@ func (p *packer) encodeUUID(u UUID) { func (p *packer) encodeVersionstamp(v Versionstamp) { p.putByte(versionstampCode) - if p.versionstampPos != -1 && v.TransactionVersion == incompleteTransactionVersion { - panic(fmt.Sprintf("Tuple can only contain one incomplete versionstamp")) - } else { + isIncomplete := v.TransactionVersion == incompleteTransactionVersion + if isIncomplete { + if p.versionstampPos != -1 { + panic(fmt.Sprintf("Tuple can only contain one incomplete versionstamp")) + } + p.versionstampPos = int32(len(p.buf)) } @@ -363,9 +366,9 @@ // Pack returns a new byte slice encoding the provided tuple. Pack will panic if // the tuple contains an element of any type other than []byte, // fdb.KeyConvertible, string, int64, int, uint64, uint, *big.Int, big.Int, float32, -// float64, bool, tuple.UUID, nil, or a Tuple with elements of valid types. It will -// also panic if an integer is specified with a value outside the range -// [-2**2040+1, 2**2040-1] +// float64, bool, tuple.UUID, tuple.Versionstamp, nil, or a Tuple with elements of +// valid types. It will also panic if an integer is specified with a value outside +// the range [-2**2040+1, 2**2040-1] // // Tuple satisfies the fdb.KeyConvertible interface, so it is not necessary to // call Pack when using a Tuple with a FoundationDB API function that requires a // key. @@ -384,22 +387,25 @@ // operations. See Pack for more information. This function will return an error // if you attempt to pack a tuple with more than one versionstamp. This function will // return an error if you attempt to pack a tuple with a versionstamp position larger -// than an uint16 on apiVersion < 520. +// than a uint16 if the API version is less than 520.
func (t Tuple) PackWithVersionstamp(prefix []byte) ([]byte, error) { hasVersionstamp, err := t.HasIncompleteVersionstamp() if err != nil { return nil, err } + apiVersion, err := fdb.GetAPIVersion() + if err != nil { + return nil, err + } + if hasVersionstamp == false { return nil, errors.New("No incomplete versionstamp included in tuple pack with versionstamp") } p := newPacker() - prefixLength := int32(0) if prefix != nil { - prefixLength = int32(len(prefix)) p.putBytes(prefix) } @@ -408,18 +414,16 @@ func (t Tuple) PackWithVersionstamp(prefix []byte) ([]byte, error) { if hasVersionstamp { var scratch [4]byte var offsetIndex int - - apiVersion := fdb.MustGetAPIVersion() if apiVersion < 520 { if p.versionstampPos > math.MaxUint16 { return nil, errors.New("Versionstamp position too large") } - offsetIndex = 1 - binary.LittleEndian.PutUint16(scratch[:], uint16(prefixLength+p.versionstampPos)) + offsetIndex = 2 + binary.LittleEndian.PutUint16(scratch[:], uint16(p.versionstampPos)) } else { - offsetIndex = 3 - binary.LittleEndian.PutUint32(scratch[:], uint32(prefixLength+p.versionstampPos)) + offsetIndex = 4 + binary.LittleEndian.PutUint32(scratch[:], uint32(p.versionstampPos)) } p.putBytes(scratch[0:offsetIndex]) @@ -439,7 +443,7 @@ func (t Tuple) HasIncompleteVersionstamp() (bool, error) { err = errors.New("Tuple can only contain one incomplete versionstamp") } - return incompleteCount == 1, err + return incompleteCount >= 1, err } func (t Tuple) countIncompleteVersionstamps() int { From 92167fd03f4d78b064328939adc73b5f91697c86 Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Sat, 9 Mar 2019 11:11:22 -0500 Subject: [PATCH 39/56] handle incomplete versionstamp attempting to be packed into vanilla tuple --- bindings/go/src/fdb/tuple/tuple.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bindings/go/src/fdb/tuple/tuple.go b/bindings/go/src/fdb/tuple/tuple.go index 5cc1990cc5..a37ce5f3e8 100644 --- a/bindings/go/src/fdb/tuple/tuple.go +++ b/bindings/go/src/fdb/tuple/tuple.go @@ -307,7 +307,7 @@ func (p *packer) encodeVersionstamp(v Versionstamp) { p.putBytes(v.Bytes()) } -func (p *packer) encodeTuple(t Tuple, nested bool) { +func (p *packer) encodeTuple(t Tuple, nested bool, versionstamps bool) { if nested { p.putByte(nestedCode) } @@ -315,7 +315,7 @@ func (p *packer) encodeTuple(t Tuple, nested bool) { for i, e := range t { switch e := e.(type) { case Tuple: - p.encodeTuple(e, true) + p.encodeTuple(e, true, versionstamps) case nil: p.putByte(nilCode) if nested { @@ -352,6 +352,10 @@ func (p *packer) encodeTuple(t Tuple, nested bool) { case UUID: p.encodeUUID(e) case Versionstamp: + if versionstamps == false && e.TransactionVersion == incompleteTransactionVersion { + panic(fmt.Sprintf("Incomplete Versionstamp included in vanilla tuple pack")) + } + p.encodeVersionstamp(e) default: panic(fmt.Sprintf("unencodable element at index %d (%v, type %T)", i, t[i], t[i])) @@ -379,7 +383,7 @@ func (p *packer) encodeTuple(t Tuple, nested bool) { // func (t Tuple) Pack() []byte { p := newPacker() - p.encodeTuple(t, false) + p.encodeTuple(t, false, false) return p.buf } @@ -409,7 +413,7 @@ func (t Tuple) PackWithVersionstamp(prefix []byte) ([]byte, error) { p.putBytes(prefix) } - p.encodeTuple(t, false) + p.encodeTuple(t, false, true) if hasVersionstamp { var scratch [4]byte From c3043971663b8b443e1213e8b2aadee39127a180 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sun, 10 Mar 2019 21:09:47 -0700 Subject: [PATCH 40/56] fixed a memory leak when allocating memory > 4K and < 
8K --- flow/Arena.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/Arena.h b/flow/Arena.h index ef5f77bda9..b29b244f0c 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -116,7 +116,7 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted<ArenaBlock> { enum { SMALL = 64, - LARGE = 4097 // If size == used == LARGE, then use hugeSize, hugeUsed + LARGE = 8193 // If size == used == LARGE, then use hugeSize, hugeUsed }; enum { NOT_TINY = 255, TINY_HEADER = 6 }; From 98bf4ddf28af903885813ee166e27aa175dc0649 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sun, 10 Mar 2019 21:19:35 -0700 Subject: [PATCH 41/56] watching the metadataVersionKey does not require ACCESS_SYSTEM_KEYS; this is not API versioned, as it is harmless to allow this for older API versions --- fdbclient/ReadYourWrites.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 16a6e4dbc2..6b854fbb7d 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1678,7 +1678,7 @@ Future<Void> ReadYourWritesTransaction::watch(const Key& key) { if( options.readYourWritesDisabled ) return watches_disabled(); - if(key >= allKeys.end || (key >= getMaxReadKey() && tr.apiVersionAtLeast(300))) + if(key >= allKeys.end || (key >= getMaxReadKey() && key != metadataVersionKey && tr.apiVersionAtLeast(300))) return key_outside_legal_range(); if (key.size() > (key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) From 041b2614643332c98e931489fac938357e0e559c Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 11 Mar 2019 01:48:51 -0700 Subject: [PATCH 42/56] Remove short option name for active snapshot interval --- fdbbackup/backup.actor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 94527b2a7a..e692b5fa36 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -213,7 +213,6 @@ CSimpleOpt::SOption g_rgBackupModifyOptions[] = { { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, { OPT_SNAPSHOTINTERVAL, "-s", SO_REQ_SEP }, { OPT_SNAPSHOTINTERVAL, "--snapshot_interval", SO_REQ_SEP }, - { OPT_MOD_ACTIVE_INTERVAL, "-as", SO_REQ_SEP }, { OPT_MOD_ACTIVE_INTERVAL, "--active_snapshot_interval", SO_REQ_SEP }, SO_END_OF_OPTIONS @@ -828,7 +827,7 @@ static void printBackupUsage(bool devhelp) { printf(" -f, --force For expire operations, force expiration even if minimum restorability would be violated.\n"); printf(" -s, --snapshot_interval DURATION\n" " For start or modify operations, specifies the backup's default target snapshot interval as DURATION seconds. Defaults to %d for start operations.\n", CLIENT_KNOBS->BACKUP_DEFAULT_SNAPSHOT_INTERVAL_SEC); - printf(" -as, --active_snapshot_interval DURATION\n" + printf(" --active_snapshot_interval DURATION\n" " For modify operations, sets the desired interval for the backup's currently active snapshot, relative to the start of the snapshot.\n"); printf(" --verify_uid UID\n" " Specifies a UID to verify against the BackupUID of the running backup.
If provided, the UID is verified in the same transaction\n" From 563f6a21b1db120a2dfc824706ea769aaa17e897 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 11 Mar 2019 01:50:34 -0700 Subject: [PATCH 43/56] Update documentation for modify active snapshot interval option --- documentation/sphinx/source/backups.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/sphinx/source/backups.rst b/documentation/sphinx/source/backups.rst index 772a6e84f2..338696257a 100644 --- a/documentation/sphinx/source/backups.rst +++ b/documentation/sphinx/source/backups.rst @@ -244,7 +244,7 @@ The ``modify`` subcommand is used to modify parameters of a running backup. All :: - user@host$ fdbbackup modify [-t <TAG>] [-d <BACKUP_URL>] [-s <DURATION>] [-as <DURATION>] [--verify_uid <UID>] + user@host$ fdbbackup modify [-t <TAG>] [-d <BACKUP_URL>] [-s <DURATION>] [--active_snapshot_interval <DURATION>] [--verify_uid <UID>] ``-d <BACKUP_URL>`` Sets a new Backup URL for the backup to write to. This is most likely to be used to change only URL parameters or account information. However, it can also be used to start writing to a new destination mid-backup. The old location will cease gaining any additional restorability, while the new location will not be restorable until a new snapshot begins and completes. Full restorability would be regained, however, if the contents of the two destinations were to be combined by the user. @@ -252,7 +252,7 @@ The ``modify`` subcommand is used to modify parameters of a running backup. All ``-s <DURATION>`` or ``--snapshot_interval <DURATION>`` Sets a new duration for backup snapshots, in seconds. -``-as <DURATION>`` or ``--active_snapshot_interval <DURATION>`` +``--active_snapshot_interval <DURATION>`` Sets a new duration for the backup's currently active snapshot, in seconds, relative to the start of the snapshot. ``--verify_uid <UID>`` From 314e87edfb74d1c0c9e6bb4e0604e62df7d8742e Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Mon, 11 Mar 2019 11:08:50 -0700 Subject: [PATCH 44/56] Create tar.gz file for python package On Windows, the default package format can be zip, while tar.gz is expected. --- bindings/python/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index 57259e7337..796185729e 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -62,7 +62,7 @@ endif() set(package_file_name foundationdb-${FDB_VERSION}.tar.gz) set(package_file ${CMAKE_BINARY_DIR}/packages/${package_file_name}) add_custom_command(OUTPUT ${package_file} - COMMAND $ setup.py sdist && + COMMAND $ setup.py sdist --formats=gztar && ${CMAKE_COMMAND} -E copy dist/${package_file_name} ${package_file} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Create Python sdist package") From d90da27ee5f8d345ca4e47cd75eda97223f5218a Mon Sep 17 00:00:00 2001 From: Ryan Worl Date: Mon, 11 Mar 2019 14:33:32 -0400 Subject: [PATCH 45/56] Add Go to list of supported bindings for `set_versionstamped_key` in the Tuple layer. --- documentation/sphinx/source/api-common.rst.inc | 4 ++-- fdbclient/vexillographer/fdb.options | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/documentation/sphinx/source/api-common.rst.inc b/documentation/sphinx/source/api-common.rst.inc index 80f26fcd9b..6ce102359f 100644 --- a/documentation/sphinx/source/api-common.rst.inc +++ b/documentation/sphinx/source/api-common.rst.inc @@ -142,10 +142,10 @@ A transaction is not permitted to read any transformed key or value previously set within that transaction, and an attempt to do so will result in an error. ..
|atomic-versionstamps-tuple-warning-key| replace:: - At this time, versionstamped keys are not compatible with the Tuple layer except in Java and Python. Note that this implies versionstamped keys may not be used with the Subspace and Directory layers except in those languages. + At this time, versionstamped keys are not compatible with the Tuple layer except in Java, Python, and Go. Note that this implies versionstamped keys may not be used with the Subspace and Directory layers except in those languages. .. |atomic-versionstamps-tuple-warning-value| replace:: - At this time, versionstamped values are not compatible with the Tuple layer except in Java and Python. Note that this implies versionstamped values may not be used with the Subspace and Directory layers except in those languages. + At this time, versionstamped values are not compatible with the Tuple layer except in Java, Python, and Go. Note that this implies versionstamped values may not be used with the Subspace and Directory layers except in those languages. .. |api-version| replace:: 610 diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index d3161632b1..2194139224 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -247,10 +247,10 @@ description is not currently required but encouraged. description="Performs a little-endian comparison of byte strings. If the existing value in the database is not present, then ``param`` is stored in the database. If the existing value in the database is shorter than ``param``, it is first extended to the length of ``param`` with zero bytes. If ``param`` is shorter than the existing value in the database, the existing value is truncated to match the length of ``param``. The smaller of the two values is then stored in the database."/>