CASSANDRA-19783 - InstanceClassLoader leak detection

This commit (along with the related in-jvm-dtest API change) adds the ability for ResourceLeakTest
to actually detect InstanceClassLoader leaks in 3 loops. In order to find these in CI:
- Pull in the in-jvm dtest API changes in 0.0.17
- Enable the looperEverythingTest to run (but not the others, which remain ignored)

In addition, this commit updates Netty to 4.1.113.Final, because the
version of Netty previously used (4.1.96.Final) caused a classloader leak
(fixed in Netty 4.1.98)

Patch by Doug Rohrer and Sam Tunnicliffe (in-jvm-dtest fix); reviewed by Mick Semb Wever for CASSANDRA-19783
and CASSANDRA-19239
This commit is contained in:
Doug Rohrer 2024-06-28 16:56:03 -04:00 committed by Doug Rohrer
parent 8af0e390be
commit f2c41accf8
5 changed files with 105 additions and 72 deletions

View File

@ -38,6 +38,7 @@
<properties>
<bytebuddy.version>1.12.13</bytebuddy.version>
<byteman.version>4.0.20</byteman.version>
<netty.version>4.1.113.Final</netty.version>
<ohc.version>0.5.1</ohc.version>
<!-- These are referenced in build.xml, so need to be propagated from there -->
@ -522,7 +523,7 @@
<dependency>
<groupId>org.apache.cassandra</groupId>
<artifactId>dtest-api</artifactId>
<version>0.0.16</version>
<version>0.0.17</version>
<scope>test</scope>
</dependency>
<dependency>
@ -728,7 +729,7 @@
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-all</artifactId>
<version>4.1.96.Final</version>
<version>${netty.version}</version>
<exclusions>
<exclusion>
<groupId>io.netty</groupId>
@ -822,18 +823,18 @@
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-transport-native-epoll</artifactId>
<version>4.1.96.Final</version>
<version>${netty.version}</version>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-transport-native-epoll</artifactId>
<version>4.1.96.Final</version>
<version>${netty.version}</version>
<classifier>linux-x86_64</classifier>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-transport-native-epoll</artifactId>
<version>4.1.96.Final</version>
<version>${netty.version}</version>
<classifier>linux-aarch_64</classifier>
</dependency>

View File

@ -1,4 +1,6 @@
5.1
* Update dtest-api to 0.0.17 to fix jvm17 crash in jvm-dtests (CASSANDRA-19239)
* Add resource leak test and update Netty to 4.1.113.Final to fix leak (CASSANDRA-19783)
* Fix incorrect nodetool suggestion when gossip mode is running (CASSANDRA-19905)
* SAI support for BETWEEN operator (CASSANDRA-19688)
* Fix BETWEEN filtering for reversed clustering columns (CASSANDRA-19878)
@ -264,12 +266,12 @@ Merged from 4.0:
Merged from 3.11:
* Revert CASSANDRA-18543 (CASSANDRA-18854)
Merged from 3.0:
* Suppress CVE-2023-6378 (CASSANDRA-19142)
* Suppress CVE-2023-6378 (CASSANDRA-19142)
* Do not set RPC_READY to false on transports shutdown in order to not fail counter updates for deployments with coordinator and storage nodes with transports turned off (CASSANDRA-18935)
* Suppress CVE-2023-44487 (CASSANDRA-18943)
* Implement the logic in bin/stop-server (CASSANDRA-18838)
* Fix nodetool enable/disablebinary to correctly set rpc readiness in gossip (CASSANDRA-18935)
* Implement the logic in bin/stop-server (CASSANDRA-18838)
* Implement the logic in bin/stop-server (CASSANDRA-18838)
* Upgrade snappy-java to 1.1.10.4 (CASSANDRA-18878)
* Add cqlshrc.sample and credentials.sample into Debian package (CASSANDRA-18818)

View File

@ -1213,8 +1213,8 @@ public abstract class AbstractCluster<I extends IInstance> implements ICluster<I
return false;
return shared.contains(s) ||
InstanceClassLoader.getDefaultLoadSharedFilter().test(s) ||
s.startsWith("org.jboss.byteman");
InstanceClassLoader.getDefaultLoadSharedFilter().test(s)
|| s.startsWith("org.jboss.byteman.");
};
}

View File

@ -18,7 +18,10 @@
package org.apache.cassandra.distributed.test;
import java.net.InetSocketAddress;
import org.apache.cassandra.distributed.Cluster;
import org.apache.cassandra.distributed.api.IInstance;
import org.apache.cassandra.distributed.api.IInstanceConfig;
import org.apache.cassandra.distributed.api.IInvokableInstance;
import org.apache.cassandra.distributed.api.IIsolatedExecutor;
@ -53,21 +56,27 @@ public class NativeProtocolTest extends TestBaseImpl
@Test
public void withClientRequests() throws Throwable
{
try (ICluster ignored = init(builder().withNodes(3)
try (ICluster cluster = init(builder().withNodes(3)
.withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL))
.start()))
{
testNativeRequests(cluster);
}
}
try (com.datastax.driver.core.Cluster cluster = com.datastax.driver.core.Cluster.builder().addContactPoint("127.0.0.1").build();
Session session = cluster.connect())
{
session.execute("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck));");
session.execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) values (1,1,1);");
Statement select = new SimpleStatement("select * from " + KEYSPACE + ".tbl;").setConsistencyLevel(ConsistencyLevel.ALL);
final ResultSet resultSet = session.execute(select);
assertRows(RowUtil.toObjects(resultSet), row(1, 1, 1));
Assert.assertEquals(3, cluster.getMetadata().getAllHosts().size());
}
/**
 * Runs a small CQL round trip against the given in-JVM dtest cluster using the DataStax Java driver.
 * <p>
 * The driver connects to node 1 of the cluster, creates {@code KEYSPACE.tbl}, inserts a single row,
 * reads it back at consistency level {@code ALL}, and finally asserts that the driver has discovered
 * exactly as many hosts as the dtest cluster reports via {@code size()}.
 *
 * @param dtestCluster the cluster to exercise; presumably it must have been started with the
 *                     native protocol feature enabled and {@code KEYSPACE} already created - confirm against callers
 */
public static void testNativeRequests(ICluster dtestCluster)
{
    // Derive the contact point from node 1 itself rather than hard-coding a loopback literal,
    // so the lookup of the instance address is actually used.
    IInstance inst = dtestCluster.get(1);
    final InetSocketAddress host = inst.broadcastAddress();
    // NOTE(review): only the IP part is used; the port of the broadcast address is presumably the
    // internode port rather than the native transport port - confirm before switching to a with-port variant.
    final String contactPoint = host.getAddress().getHostAddress();
    try (com.datastax.driver.core.Cluster cluster = com.datastax.driver.core.Cluster.builder().addContactPoint(contactPoint).build();
         Session session = cluster.connect())
    {
        session.execute("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck));");
        session.execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) values (1,1,1);");
        Statement select = new SimpleStatement("select * from " + KEYSPACE + ".tbl;").setConsistencyLevel(ConsistencyLevel.ALL);
        final ResultSet resultSet = session.execute(select);
        assertRows(RowUtil.toObjects(resultSet), row(1, 1, 1));
        // The driver should see every node of the dtest cluster, whatever its size.
        Assert.assertEquals(dtestCluster.size(), cluster.getMetadata().getAllHosts().size());
    }
}

View File

@ -24,11 +24,14 @@ import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.sql.Date;
import java.text.SimpleDateFormat;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import javax.management.MBeanServer;
import javax.management.MBeanServerConnection;
import javax.management.remote.JMXConnector;
import org.apache.cassandra.distributed.shared.Uninterruptibles;
import org.apache.cassandra.distributed.shared.InstanceClassLoader;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
@ -36,7 +39,6 @@ import org.junit.Test;
import com.sun.management.HotSpotDiagnosticMXBean;
import org.apache.cassandra.distributed.Cluster;
import org.apache.cassandra.distributed.api.ConsistencyLevel;
import org.apache.cassandra.distributed.api.Feature;
import org.apache.cassandra.distributed.api.IInstanceConfig;
import org.apache.cassandra.distributed.api.IInvokableInstance;
import org.apache.cassandra.distributed.shared.JMXUtil;
@ -49,6 +51,7 @@ import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL;
import static org.apache.cassandra.distributed.api.Feature.NETWORK;
import static org.apache.cassandra.distributed.test.jmx.JMXGetterCheckTest.testAllValidGetters;
import static org.apache.cassandra.utils.FBUtilities.now;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.hamcrest.Matchers.startsWith;
/* Resource Leak Test - useful when tracking down issues with in-JVM framework cleanup.
@ -63,14 +66,17 @@ import static org.hamcrest.Matchers.startsWith;
* the final hprof and check that the class loaders are not reachable from a GC root),
* but it shows that the file handles for Data/Index files are being leaked.
*/
@Ignore
public class ResourceLeakTest extends TestBaseImpl
{
// Parameters to adjust while hunting for leaks
final int numTestLoops = 1; // Set this value high to crash on leaks, or low when tracking down an issue.
final boolean dumpEveryLoop = false; // Dump heap & possibly files every loop
// numTestLoops should be >= 3, and numClusterNodes >= 2, when committed to source control.
// This ensures the instance class loader assertions will fail quickly and reliably when a
// new InstanceClassLoader leak is introduced.
final int numTestLoops = 3;
final int numClusterNodes = 2;
final boolean dumpEveryLoop = true; // Dump heap & possibly files every loop
final boolean dumpFileHandles = false; // Call lsof whenever dumping resources
final boolean forceCollection = false; // Whether to explicitly force finalization/gc for smaller heap dumps
final long finalWaitMillis = 0L; // Number of millis to wait before final resource dump to give gc a chance
static final SimpleDateFormat format = new SimpleDateFormat("yyyyMMddHHmmss");
@ -169,17 +175,31 @@ public class ResourceLeakTest extends TestBaseImpl
}
}
void doTest(int numClusterNodes, Consumer<IInstanceConfig> updater) throws Throwable
{
doTest(numClusterNodes, updater, ignored -> {});
/**
 * Waits for the approximate number of live {@code InstanceClassLoader}s to drop to the allowed
 * maximum, and fails the test if it does not.
 * <p>
 * The live-loader count is polled once per second, for at most 120 sleeps. If the count is still
 * above the limit afterwards, resources are dumped (tagged with the loop number) and an assertion
 * failure is raised.
 *
 * @param maxAllowableInstances highest live loader count that is still considered leak-free
 * @param loop                  index of the current test loop, used only to label the resource dump
 * @throws IOException          declared for the resource dump
 * @throws InterruptedException declared for the resource dump
 */
void checkForInstanceClassLoaderLeaks(int maxAllowableInstances, int loop) throws IOException, InterruptedException
{
    // Poll first, then check the attempt budget: the count is queried before every sleep
    // and once more when the budget is exhausted.
    int attempt = 0;
    while (InstanceClassLoader.getApproximateLiveLoaderCount(true) > maxAllowableInstances && attempt < 120)
    {
        Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
        attempt++;
    }

    final int liveLoaders = InstanceClassLoader.getApproximateLiveLoaderCount(true);
    if (liveLoaders <= maxAllowableInstances)
        return;

    // Still over the limit: capture diagnostics before failing so the leak can be investigated.
    dumpResources(String.format("InstanceClassLoader max reached at loop %d", loop));
    Assert.assertThat("InstanceClassLoader leak detected",
                      liveLoaders,
                      lessThanOrEqualTo(maxAllowableInstances));
}
void doTest(int numClusterNodes, Consumer<IInstanceConfig> updater, Consumer<Cluster> actionToPerform) throws Throwable
/**
 * Convenience overload: runs the looped cluster test with the given config updater,
 * a no-op cluster action, and the InstanceClassLoader leak check disabled.
 */
void doTest(Consumer<IInstanceConfig> updater) throws Throwable
{
doTest(updater, ignored -> {}, false);
}
void doTest(Consumer<IInstanceConfig> updater, Consumer<Cluster> actionToPerform, boolean shouldCheckForClassloaderLeaks) throws Throwable
{
for (int loop = 0; loop < numTestLoops; loop++)
{
System.out.println(String.format("========== Starting loop %03d ========", loop));
try (Cluster cluster = (Cluster) builder().withNodes(numClusterNodes).withConfig(updater).start())
Cluster.Builder builder = builder();
try (Cluster cluster = (Cluster) builder.withNodes(numClusterNodes).withConfig(updater).start())
{
init(cluster);
String tableName = "tbl" + loop;
@ -191,87 +211,88 @@ public class ResourceLeakTest extends TestBaseImpl
{
dumpResources(String.format("loop%03d", loop));
}
// We add 2 to the number of allowed classloaders to provide some wiggle room, as GC is non-deterministic
// and some threads don't always shut down in time
if (shouldCheckForClassloaderLeaks)
{
checkForInstanceClassLoaderLeaks(numClusterNodes + 2, loop);
}
}
catch (AssertionError ae) {
throw ae;
}
catch (Throwable tr)
{
System.out.println("Dumping resources for exception: " + tr.getMessage());
System.out.println("Dumping resources for exception: " + tr);
tr.printStackTrace();
dumpResources("exception");
}
if (forceCollection)
{
System.runFinalization();
System.gc();
}
System.runFinalization();
System.gc();
System.out.println(String.format("========== Completed loop %03d ========", loop));
}
}
@Ignore("Only run if debugging an issue")
@Test
public void looperTest() throws Throwable
{
doTest(1, config -> {});
if (forceCollection)
{
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
}
doTest(config -> {});
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
dumpResources("final");
}
@Ignore("Only run if debugging an issue")
@Test
public void looperGossipNetworkTest() throws Throwable
{
doTest(2, config -> config.with(GOSSIP).with(NETWORK));
if (forceCollection)
{
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
}
doTest(config -> config.with(GOSSIP).with(NETWORK));
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
dumpResources("final-gossip-network");
}
@Ignore("Only run if debugging an issue")
@Test
public void looperNativeTest() throws Throwable
{
doTest(2, config -> config.with(NATIVE_PROTOCOL));
if (forceCollection)
{
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
}
doTest(config -> config.with(NATIVE_PROTOCOL));
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
dumpResources("final-native");
}
@Ignore("Only run if debugging an issue")
@Test
public void looperJmxTest() throws Throwable
{
doTest(2, config -> config.with(JMX), ResourceLeakTest::testJmx);
if (forceCollection)
{
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
}
doTest(config -> config.with(JMX), ResourceLeakTest::testJmx, false);
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
dumpResources("final-jmx");
}
/**
* This test is enabled in an attempt to automatically catch InstanceClassLoader leaks.
* Depending on the type of leak, we may need to actually exercise functionality like JMX or Native
* beyond just enabling the feature, so we use the "everything" test even though it may take longer to run.
*/
@Test
public void looperEverythingTest() throws Throwable
{
doTest(2, config -> config.with(Feature.values()),
doTest(config -> config.with(NETWORK, GOSSIP, NATIVE_PROTOCOL, JMX), // Exclude `BLANK_GOSSIP`
cluster -> {
NativeProtocolTest.testNativeRequests(cluster);
testJmx(cluster);
});
if (forceCollection)
{
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
}
}, true);
System.runFinalization();
System.gc();
Thread.sleep(finalWaitMillis);
dumpResources("final-everything");
}
}