mirror of https://github.com/apache/cassandra
CASSANDRA-19783 - InstanceClassLoader leak detection
This commit (along with the related in-jvm-dtest API change) adds the ability for ResourceLeakTest to actually detect InstanceClassLoader leaks in 3 loops. In order to find these in CI, it: (1) pulls in the in-jvm dtest API changes from dtest-api 0.0.17, and (2) enables looperEverythingTest to run (but not the other looper tests, which remain ignored). In addition, this commit updates Netty to 4.1.113.Final, as the version of Netty previously used (4.1.96.Final) caused a classloader leak (fixed in Netty 4.1.98). Patch by Doug Rohrer and Sam Tunnicliffe (in-jvm-dtest fix); reviewed by Mick Semb Wever for CASSANDRA-19783 and CASSANDRA-19239
This commit is contained in:
parent
8af0e390be
commit
f2c41accf8
|
@ -38,6 +38,7 @@
|
|||
<properties>
|
||||
<bytebuddy.version>1.12.13</bytebuddy.version>
|
||||
<byteman.version>4.0.20</byteman.version>
|
||||
<netty.version>4.1.113.Final</netty.version>
|
||||
<ohc.version>0.5.1</ohc.version>
|
||||
|
||||
<!-- These are referenced in build.xml, so need to be propagated from there -->
|
||||
|
@ -522,7 +523,7 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.cassandra</groupId>
|
||||
<artifactId>dtest-api</artifactId>
|
||||
<version>0.0.16</version>
|
||||
<version>0.0.17</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
@ -728,7 +729,7 @@
|
|||
<dependency>
|
||||
<groupId>io.netty</groupId>
|
||||
<artifactId>netty-all</artifactId>
|
||||
<version>4.1.96.Final</version>
|
||||
<version>${netty.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>io.netty</groupId>
|
||||
|
@ -822,18 +823,18 @@
|
|||
<dependency>
|
||||
<groupId>io.netty</groupId>
|
||||
<artifactId>netty-transport-native-epoll</artifactId>
|
||||
<version>4.1.96.Final</version>
|
||||
<version>${netty.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>io.netty</groupId>
|
||||
<artifactId>netty-transport-native-epoll</artifactId>
|
||||
<version>4.1.96.Final</version>
|
||||
<version>${netty.version}</version>
|
||||
<classifier>linux-x86_64</classifier>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>io.netty</groupId>
|
||||
<artifactId>netty-transport-native-epoll</artifactId>
|
||||
<version>4.1.96.Final</version>
|
||||
<version>${netty.version}</version>
|
||||
<classifier>linux-aarch_64</classifier>
|
||||
</dependency>
|
||||
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
5.1
|
||||
* Update dtest-api to 0.0.17 to fix jvm17 crash in jvm-dtests (CASSANDRA-19239)
|
||||
* Add resource leak test and update Netty to 4.1.113.Final to fix InstanceClassLoader leak (CASSANDRA-19783)
|
||||
* Fix incorrect nodetool suggestion when gossip mode is running (CASSANDRA-19905)
|
||||
* SAI support for BETWEEN operator (CASSANDRA-19688)
|
||||
* Fix BETWEEN filtering for reversed clustering columns (CASSANDRA-19878)
|
||||
|
@ -264,12 +266,12 @@ Merged from 4.0:
|
|||
Merged from 3.11:
|
||||
* Revert CASSANDRA-18543 (CASSANDRA-18854)
|
||||
Merged from 3.0:
|
||||
* Suppress CVE-2023-6378 (CASSANDRA-19142)
|
||||
* Suppress CVE-2023-6378 (CASSANDRA-19142)
|
||||
* Do not set RPC_READY to false on transports shutdown in order to not fail counter updates for deployments with coordinator and storage nodes with transports turned off (CASSANDRA-18935)
|
||||
* Suppress CVE-2023-44487 (CASSANDRA-18943)
|
||||
* Implement the logic in bin/stop-server (CASSANDRA-18838)
|
||||
* Fix nodetool enable/disablebinary to correctly set rpc readiness in gossip (CASSANDRA-18935)
|
||||
* Implement the logic in bin/stop-server (CASSANDRA-18838)
|
||||
* Implement the logic in bin/stop-server (CASSANDRA-18838)
|
||||
* Upgrade snappy-java to 1.1.10.4 (CASSANDRA-18878)
|
||||
* Add cqlshrc.sample and credentials.sample into Debian package (CASSANDRA-18818)
|
||||
|
||||
|
|
|
@ -1213,8 +1213,8 @@ public abstract class AbstractCluster<I extends IInstance> implements ICluster<I
|
|||
return false;
|
||||
|
||||
return shared.contains(s) ||
|
||||
InstanceClassLoader.getDefaultLoadSharedFilter().test(s) ||
|
||||
s.startsWith("org.jboss.byteman");
|
||||
InstanceClassLoader.getDefaultLoadSharedFilter().test(s)
|
||||
|| s.startsWith("org.jboss.byteman.");
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -18,7 +18,10 @@
|
|||
|
||||
package org.apache.cassandra.distributed.test;
|
||||
|
||||
import java.net.InetSocketAddress;
|
||||
|
||||
import org.apache.cassandra.distributed.Cluster;
|
||||
import org.apache.cassandra.distributed.api.IInstance;
|
||||
import org.apache.cassandra.distributed.api.IInstanceConfig;
|
||||
import org.apache.cassandra.distributed.api.IInvokableInstance;
|
||||
import org.apache.cassandra.distributed.api.IIsolatedExecutor;
|
||||
|
@ -53,21 +56,27 @@ public class NativeProtocolTest extends TestBaseImpl
|
|||
@Test
|
||||
public void withClientRequests() throws Throwable
|
||||
{
|
||||
try (ICluster ignored = init(builder().withNodes(3)
|
||||
try (ICluster cluster = init(builder().withNodes(3)
|
||||
.withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL))
|
||||
.start()))
|
||||
{
|
||||
testNativeRequests(cluster);
|
||||
}
|
||||
}
|
||||
|
||||
try (com.datastax.driver.core.Cluster cluster = com.datastax.driver.core.Cluster.builder().addContactPoint("127.0.0.1").build();
|
||||
Session session = cluster.connect())
|
||||
{
|
||||
session.execute("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck));");
|
||||
session.execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) values (1,1,1);");
|
||||
Statement select = new SimpleStatement("select * from " + KEYSPACE + ".tbl;").setConsistencyLevel(ConsistencyLevel.ALL);
|
||||
final ResultSet resultSet = session.execute(select);
|
||||
assertRows(RowUtil.toObjects(resultSet), row(1, 1, 1));
|
||||
Assert.assertEquals(3, cluster.getMetadata().getAllHosts().size());
|
||||
}
|
||||
public static void testNativeRequests(ICluster dtestCluster)
|
||||
{
|
||||
IInstance inst = dtestCluster.get(1);
|
||||
final InetSocketAddress host = inst.broadcastAddress();
|
||||
try (com.datastax.driver.core.Cluster cluster = com.datastax.driver.core.Cluster.builder().addContactPoint("127.0.0.1").build();
|
||||
Session session = cluster.connect())
|
||||
{
|
||||
session.execute("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck));");
|
||||
session.execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) values (1,1,1);");
|
||||
Statement select = new SimpleStatement("select * from " + KEYSPACE + ".tbl;").setConsistencyLevel(ConsistencyLevel.ALL);
|
||||
final ResultSet resultSet = session.execute(select);
|
||||
assertRows(RowUtil.toObjects(resultSet), row(1, 1, 1));
|
||||
Assert.assertEquals(dtestCluster.size(), cluster.getMetadata().getAllHosts().size());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -24,11 +24,14 @@ import java.nio.file.FileSystems;
|
|||
import java.nio.file.Path;
|
||||
import java.sql.Date;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Consumer;
|
||||
import javax.management.MBeanServer;
|
||||
import javax.management.MBeanServerConnection;
|
||||
import javax.management.remote.JMXConnector;
|
||||
|
||||
import org.apache.cassandra.distributed.shared.Uninterruptibles;
|
||||
import org.apache.cassandra.distributed.shared.InstanceClassLoader;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
@ -36,7 +39,6 @@ import org.junit.Test;
|
|||
import com.sun.management.HotSpotDiagnosticMXBean;
|
||||
import org.apache.cassandra.distributed.Cluster;
|
||||
import org.apache.cassandra.distributed.api.ConsistencyLevel;
|
||||
import org.apache.cassandra.distributed.api.Feature;
|
||||
import org.apache.cassandra.distributed.api.IInstanceConfig;
|
||||
import org.apache.cassandra.distributed.api.IInvokableInstance;
|
||||
import org.apache.cassandra.distributed.shared.JMXUtil;
|
||||
|
@ -49,6 +51,7 @@ import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL;
|
|||
import static org.apache.cassandra.distributed.api.Feature.NETWORK;
|
||||
import static org.apache.cassandra.distributed.test.jmx.JMXGetterCheckTest.testAllValidGetters;
|
||||
import static org.apache.cassandra.utils.FBUtilities.now;
|
||||
import static org.hamcrest.Matchers.lessThanOrEqualTo;
|
||||
import static org.hamcrest.Matchers.startsWith;
|
||||
|
||||
/* Resource Leak Test - useful when tracking down issues with in-JVM framework cleanup.
|
||||
|
@ -63,14 +66,17 @@ import static org.hamcrest.Matchers.startsWith;
|
|||
* the final hprof and check that the class loaders are not reachable from a GC root),
|
||||
* but it shows that the file handles for Data/Index files are being leaked.
|
||||
*/
|
||||
@Ignore
|
||||
|
||||
public class ResourceLeakTest extends TestBaseImpl
|
||||
{
|
||||
// Parameters to adjust while hunting for leaks
|
||||
final int numTestLoops = 1; // Set this value high to crash on leaks, or low when tracking down an issue.
|
||||
final boolean dumpEveryLoop = false; // Dump heap & possibly files every loop
|
||||
// numTestLoops should be >= 3, and numClusterNodes >= 2, when committed to source control.
|
||||
// This ensures the instance class loader assertions will fail quickly and reliably when a
|
||||
// new InstanceClassLoader leak is introduced.
|
||||
final int numTestLoops = 3;
|
||||
final int numClusterNodes = 2;
|
||||
final boolean dumpEveryLoop = true; // Dump heap & possibly files every loop
|
||||
final boolean dumpFileHandles = false; // Call lsof whenever dumping resources
|
||||
final boolean forceCollection = false; // Whether to explicitly force finalization/gc for smaller heap dumps
|
||||
final long finalWaitMillis = 0L; // Number of millis to wait before final resource dump to give gc a chance
|
||||
|
||||
static final SimpleDateFormat format = new SimpleDateFormat("yyyyMMddHHmmss");
|
||||
|
@ -169,17 +175,31 @@ public class ResourceLeakTest extends TestBaseImpl
|
|||
}
|
||||
}
|
||||
|
||||
void doTest(int numClusterNodes, Consumer<IInstanceConfig> updater) throws Throwable
|
||||
{
|
||||
doTest(numClusterNodes, updater, ignored -> {});
|
||||
void checkForInstanceClassLoaderLeaks(int maxAllowableInstances, int loop) throws IOException, InterruptedException {
|
||||
for (int i = 0; InstanceClassLoader.getApproximateLiveLoaderCount(true) > maxAllowableInstances && i < 120; i++) {
|
||||
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
|
||||
}
|
||||
int approximateLiveLoaderCount = InstanceClassLoader.getApproximateLiveLoaderCount(true);
|
||||
if (approximateLiveLoaderCount > maxAllowableInstances) {
|
||||
dumpResources(String.format("InstanceClassLoader max reached at loop %d", loop));
|
||||
Assert.assertThat("InstanceClassLoader leak detected",
|
||||
approximateLiveLoaderCount,
|
||||
lessThanOrEqualTo(maxAllowableInstances));
|
||||
}
|
||||
}
|
||||
|
||||
void doTest(int numClusterNodes, Consumer<IInstanceConfig> updater, Consumer<Cluster> actionToPerform) throws Throwable
|
||||
void doTest(Consumer<IInstanceConfig> updater) throws Throwable
|
||||
{
|
||||
doTest(updater, ignored -> {}, false);
|
||||
}
|
||||
|
||||
void doTest(Consumer<IInstanceConfig> updater, Consumer<Cluster> actionToPerform, boolean shouldCheckForClassloaderLeaks) throws Throwable
|
||||
{
|
||||
for (int loop = 0; loop < numTestLoops; loop++)
|
||||
{
|
||||
System.out.println(String.format("========== Starting loop %03d ========", loop));
|
||||
try (Cluster cluster = (Cluster) builder().withNodes(numClusterNodes).withConfig(updater).start())
|
||||
Cluster.Builder builder = builder();
|
||||
try (Cluster cluster = (Cluster) builder.withNodes(numClusterNodes).withConfig(updater).start())
|
||||
{
|
||||
init(cluster);
|
||||
String tableName = "tbl" + loop;
|
||||
|
@ -191,87 +211,88 @@ public class ResourceLeakTest extends TestBaseImpl
|
|||
{
|
||||
dumpResources(String.format("loop%03d", loop));
|
||||
}
|
||||
// We add 2 to the number of allowed classloaders to provide some wiggle room, as GC is non-deterministic
|
||||
// and some threads don't always shut down in time
|
||||
if (shouldCheckForClassloaderLeaks)
|
||||
{
|
||||
checkForInstanceClassLoaderLeaks(numClusterNodes + 2, loop);
|
||||
}
|
||||
}
|
||||
catch (AssertionError ae) {
|
||||
throw ae;
|
||||
}
|
||||
catch (Throwable tr)
|
||||
{
|
||||
System.out.println("Dumping resources for exception: " + tr.getMessage());
|
||||
System.out.println("Dumping resources for exception: " + tr);
|
||||
tr.printStackTrace();
|
||||
dumpResources("exception");
|
||||
}
|
||||
if (forceCollection)
|
||||
{
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
}
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
System.out.println(String.format("========== Completed loop %03d ========", loop));
|
||||
}
|
||||
}
|
||||
|
||||
@Ignore("Only run if debugging an issue")
|
||||
@Test
|
||||
public void looperTest() throws Throwable
|
||||
{
|
||||
doTest(1, config -> {});
|
||||
if (forceCollection)
|
||||
{
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
}
|
||||
doTest(config -> {});
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
dumpResources("final");
|
||||
}
|
||||
|
||||
@Ignore("Only run if debugging an issue")
|
||||
@Test
|
||||
public void looperGossipNetworkTest() throws Throwable
|
||||
{
|
||||
doTest(2, config -> config.with(GOSSIP).with(NETWORK));
|
||||
if (forceCollection)
|
||||
{
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
}
|
||||
doTest(config -> config.with(GOSSIP).with(NETWORK));
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
dumpResources("final-gossip-network");
|
||||
}
|
||||
|
||||
@Ignore("Only run if debugging an issue")
|
||||
@Test
|
||||
public void looperNativeTest() throws Throwable
|
||||
{
|
||||
doTest(2, config -> config.with(NATIVE_PROTOCOL));
|
||||
if (forceCollection)
|
||||
{
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
}
|
||||
doTest(config -> config.with(NATIVE_PROTOCOL));
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
dumpResources("final-native");
|
||||
}
|
||||
|
||||
@Ignore("Only run if debugging an issue")
|
||||
@Test
|
||||
public void looperJmxTest() throws Throwable
|
||||
{
|
||||
doTest(2, config -> config.with(JMX), ResourceLeakTest::testJmx);
|
||||
if (forceCollection)
|
||||
{
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
}
|
||||
doTest(config -> config.with(JMX), ResourceLeakTest::testJmx, false);
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
dumpResources("final-jmx");
|
||||
}
|
||||
|
||||
/**
|
||||
* This test is enabled in an attempt to automatically catch InstanceClassLoader leaks.
|
||||
* Depending on the type of leak, we may need to actually exercise functionality like JMX or Native
|
||||
* beyond just enabling the feature, so we use the "everything" test even though it may take longer to run.
|
||||
*/
|
||||
@Test
|
||||
public void looperEverythingTest() throws Throwable
|
||||
{
|
||||
doTest(2, config -> config.with(Feature.values()),
|
||||
doTest(config -> config.with(NETWORK, GOSSIP, NATIVE_PROTOCOL, JMX), // Exclude `BLANK_GOSSIP`
|
||||
cluster -> {
|
||||
NativeProtocolTest.testNativeRequests(cluster);
|
||||
testJmx(cluster);
|
||||
});
|
||||
if (forceCollection)
|
||||
{
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
}
|
||||
}, true);
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
Thread.sleep(finalWaitMillis);
|
||||
dumpResources("final-everything");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue