Merge pull request #2305 from etschannen/release-6.2

Shard merges crossing systemKeys.begin did not decrement systemSizeEstimate
Commit 1dc5985062 by A.J. Beamon, 2019-11-01 09:12:01 -07:00, committed by GitHub
2 changed files with 18 additions and 1 deletion
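
For context, the bookkeeping this change adds can be summarized as: while merging, accumulate the bytes of every consumed shard whose begin key is at or above systemKeys.begin (the system keyspace, i.e. keys starting with \xff), and if the resulting merged range itself begins below that boundary, subtract the accumulated bytes from the tracker's systemSizeEstimate (surfaced as the ``system_kv_size_bytes`` status field), since the merged shard will no longer be tracked as system data. Below is a minimal sketch of that logic with hypothetical, simplified types; ``Shard``, ``systemBytesFreedByMerge``, and the literal keys are illustrative stand-ins, not FoundationDB APIs.

```cpp
// Illustrative sketch only: Shard and systemBytesFreedByMerge are made-up
// stand-ins for shardMerger's shard map, mergeRange.begin and systemKeys.begin.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct Shard {
    std::string begin; // first key covered by the shard
    int64_t bytes;     // estimated size of the shard
};

// How much the global system-size estimate should shrink after merging
// `shards` into one range that starts at `mergeBegin`.
int64_t systemBytesFreedByMerge(const std::vector<Shard>& shards,
                                const std::string& mergeBegin,
                                const std::string& systemBegin) {
    int64_t systemBytes = 0;
    for (const auto& s : shards) {
        if (s.begin >= systemBegin) { // this shard was counted as system data
            systemBytes += s.bytes;
        }
    }
    // The merged shard keeps counting as system data only if it starts at or
    // above systemBegin; otherwise its absorbed system bytes must be released.
    return (mergeBegin < systemBegin) ? systemBytes : 0;
}

int main() {
    // A merge that starts in normal keyspace but swallows one system shard.
    std::vector<Shard> merged = {{"a", 1000}, {"\xff/status", 500}};
    std::cout << systemBytesFreedByMerge(merged, "a", "\xff") << "\n"; // prints 500
    return 0;
}
```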


@@ -2,6 +2,14 @@
 Release Notes
 #############
 
+6.2.8
+=====
+
+Fixes
+-----
+
+* The ``system_kv_size_bytes`` status field could report a size much larger than the actual size of the system keyspace. `(PR #2305) <https://github.com/apple/foundationdb/pull/2305>`_.
+
 6.2.7
 =====
 
@@ -133,7 +141,6 @@ Fixes only impacting 6.2.0+
 * The cluster controller would saturate its CPU for a few seconds when sending configuration information to all of the worker processes. [6.2.4] `(PR #2086) <https://github.com/apple/foundationdb/pull/2086>`_.
 * The data distributor would build all possible team combinations if it was tracking an unhealthy server with less than 10 teams. [6.2.4] `(PR #2099) <https://github.com/apple/foundationdb/pull/2099>`_.
 * The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) <https://github.com/apple/foundationdb/pull/2065>`_.
-* The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) <https://github.com/apple/foundationdb/pull/2065>`_.
 * A storage server could crash if it took longer than 10 minutes to fetch a key range from another server. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
 * Excluding or including servers would restart the data distributor. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
 * The data distributor could read invalid memory when estimating database size. [6.2.6] `(PR #2225) <https://github.com/apple/foundationdb/pull/2225>`_.


@@ -402,6 +402,7 @@ Future<Void> shardMerger(
 	bool forwardComplete = false;
 	KeyRangeRef merged;
 	StorageMetrics endingStats = shardSize->get().get();
+	int64_t systemBytes = keys.begin >= systemKeys.begin ? shardSize->get().get().bytes : 0;
 
 	loop {
 		Optional<StorageMetrics> newMetrics;
@@ -439,6 +440,9 @@ Future<Void> shardMerger(
 	merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end );
 	endingStats += newMetrics.get();
+	if((forwardComplete ? prevIter->range().begin : nextIter->range().begin) >= systemKeys.begin) {
+		systemBytes += newMetrics.get().bytes;
+	}
 	shardsMerged++;
 
 	auto shardBounds = getShardSizeBounds( merged, maxShardSize );
@@ -457,6 +461,9 @@ Future<Void> shardMerger(
 	// If going forward, remove most recently added range
 	endingStats -= newMetrics.get();
+	if(nextIter->range().begin >= systemKeys.begin) {
+		systemBytes -= newMetrics.get().bytes;
+	}
 	shardsMerged--;
 	--nextIter;
 	merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end );
@@ -473,6 +480,9 @@ Future<Void> shardMerger(
 		.detail("EndingSize", endingStats.bytes)
 		.detail("BatchedMerges", shardsMerged);
 
+	if(mergeRange.begin < systemKeys.begin) {
+		self->systemSizeEstimate -= systemBytes;
+	}
 	restartShardTrackers( self, mergeRange, endingStats );
 	self->shardsAffectedByTeamFailure->defineShard( mergeRange );
 	self->output.send( RelocateShard( mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD ) );