Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
5.1
* GCInspector should use different thresholds on GC events for warning messages (CASSANDRA-20980)
* Add cqlsh autocompletion for the identity mapping feature (CASSANDRA-20021)
* Add DDL Guardrail enabling administrators to disallow creation/modification of keyspaces with durable_writes = false (CASSANDRA-20913)
* Optimize Counter, Meter and Histogram metrics using thread local counters (CASSANDRA-20250)
Expand Down
16 changes: 16 additions & 0 deletions conf/cassandra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,11 @@ batch_size_fail_threshold: 50KiB
# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
unlogged_batch_across_partitions_warn_threshold: 10

# GCInspector configs:
# For GC like ShenandoahGC/ZGC etc., there are GC events that do not have STW pauses (Concurrent phases)
# Operator might find it reasonable to use lower thresholds for events require STW pauses and higher thresholds
# for concurrent phases.
#
# GC Pauses greater than 200 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
Expand All @@ -2059,6 +2064,17 @@ unlogged_batch_across_partitions_warn_threshold: 10
# Min unit: ms
# gc_warn_threshold: 1000ms

# GC Concurrent phase greater than 200 ms will be logged at INFO level
Copy link
Contributor

@netudima netudima Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the value (200) in the description does not match the value in the commented value lower (1000)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated - thanks for catching this!

# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
# gc_concurrent_phase_log_threshold: 1000ms

# GC Concurrent phase than gc_concurrent_phase_warn_threshold will be logged at WARN level
# Adjust the threshold based on your application throughput requirement. Setting to 0
# will deactivate the feature.
# Min unit: ms
# gc_concurrent_phase_warn_threshold: 2000ms

# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
# early. Any value size larger than this threshold will result into marking an SSTable
# as corrupted. This should be positive and less than 2GiB.
Expand Down
16 changes: 16 additions & 0 deletions conf/cassandra_latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1913,6 +1913,11 @@ batch_size_fail_threshold: 50KiB
# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
unlogged_batch_across_partitions_warn_threshold: 10

# GCInspector configs:
# For GC like ShenandoahGC/ZGC etc., there are GC events that do not have STW pauses.
# Such events are called Concurrent phases. Operator might find it reasonable to use lower thresholds
# for events require STW pauses and higher thresholds for concurrent phases.
#
# GC Pauses greater than 200 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
Expand All @@ -1924,6 +1929,17 @@ unlogged_batch_across_partitions_warn_threshold: 10
# Min unit: ms
# gc_warn_threshold: 1000ms

# GC Concurrent phase greater than 200 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
# gc_concurrent_phase_log_threshold: 1000ms

# GC Concurrent phase than gc_concurrent_phase_warn_threshold will be logged at WARN level
# Adjust the threshold based on your application throughput requirement. Setting to 0
# will deactivate the feature.
# Min unit: ms
# gc_concurrent_phase_warn_threshold: 2000ms

# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
# early. Any value size larger than this threshold will result into marking an SSTable
# as corrupted. This should be positive and less than 2GiB.
Expand Down
2 changes: 2 additions & 0 deletions src/java/org/apache/cassandra/config/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,8 @@ public static class SSTableConfig
public volatile DurationSpec.IntMillisecondsBound gc_log_threshold = new DurationSpec.IntMillisecondsBound("200ms");
@Replaces(oldName = "gc_warn_threshold_in_ms", converter = Converters.MILLIS_DURATION_INT, deprecated = true)
public volatile DurationSpec.IntMillisecondsBound gc_warn_threshold = new DurationSpec.IntMillisecondsBound("1s");
public volatile DurationSpec.IntMillisecondsBound gc_concurrent_phase_log_threshold = new DurationSpec.IntMillisecondsBound("1s");
public volatile DurationSpec.IntMillisecondsBound gc_concurrent_phase_warn_threshold = new DurationSpec.IntMillisecondsBound("2s");

// TTL for different types of trace events.
@Replaces(oldName = "tracetype_query_ttl", converter = Converters.SECONDS_DURATION, deprecated=true)
Expand Down
21 changes: 21 additions & 0 deletions src/java/org/apache/cassandra/config/DatabaseDescriptor.java
Original file line number Diff line number Diff line change
Expand Up @@ -4698,6 +4698,27 @@ public static void setGCWarnThreshold(int threshold)
conf.gc_warn_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}

public static int getGCConcurrentPhaseLogThreshold()
{
return conf.gc_concurrent_phase_log_threshold.toMilliseconds();
}

public static void setGCConcurrentPhaseLogThreshold(int threshold)
{
conf.gc_concurrent_phase_log_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}


public static int getGCConcurrentPhaseWarnThreshold()
{
return conf.gc_concurrent_phase_warn_threshold.toMilliseconds();
}

public static void setGCConcurrentPhaseWarnThreshold(int threshold)
{
conf.gc_concurrent_phase_warn_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}

public static boolean isCDCEnabled()
{
return conf.cdc_enabled;
Expand Down
82 changes: 73 additions & 9 deletions src/java/org/apache/cassandra/service/GCInspector.java
Original file line number Diff line number Diff line change
Expand Up @@ -287,23 +287,53 @@ public void handleNotification(final Notification notification, final Object han
if (state.compareAndSet(prev, new State(duration, bytes, prev)))
break;
}

if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs())
logger.warn(sb.toString());
else if (duration > getGcLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());

if (duration > this.getStatusThresholdInMs())
StatusLogger.log();
if (isConcurrentPhase(info.getGcCause(), info.getGcName()))
{
if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if it is a concurrent phase = true branch why do we use non-concurrent thresholds here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated - thanks for catching this!

logger.warn(sb.toString());
else if (duration > getGcLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());
// TODO: trigger StatusLogger if concurrent phases take too long?
}
else
{
if (getGcConcurrentPhaseWarnThresholdInMs() != 0 && duration > getGcConcurrentPhaseWarnThresholdInMs())
logger.warn(sb.toString());
else if (duration > getGcConcurrentPhaseLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());
if (duration > this.getStatusThresholdInMs())
StatusLogger.log();
}

// if we just finished an old gen collection and we're still using a lot of memory, try to reduce the pressure
if (gcState.assumeGCIsOldGen)
LifecycleTransaction.rescheduleFailedDeletions();
}
}

static boolean isConcurrentPhase(String cause, String name) {
// Mostly taken from: https://github.com/Netflix/spectator/blob/v1.7.x/spectator-ext-gc/src/main/java/com/netflix/spectator/gc/GcLogger.java
// So far the only indicator known is that the cause will be reported as "No GC"
// when using CMS.
//
// For ZGC, behavior was changed in JDK17: https://bugs.openjdk.java.net/browse/JDK-8265136
// For ZGC in older versions, there is no way to accurately get the amount of time
// in STW pauses.
//
// For G1, a new bean is added in JDK20 to indicate time spent in concurrent phases:
// https://bugs.openjdk.org/browse/JDK-8297247

return "No GC".equals(cause) // CMS
|| "G1 Concurrent GC".equals(name) // G1 in JDK20+
|| name.endsWith(" Cycles"); // Shenandoah, ZGC
}


public State getTotalSinceLastCheck()
{
return state.getAndSet(new State());
Expand Down Expand Up @@ -407,6 +437,40 @@ public void setGcLogThresholdInMs(long threshold)
DatabaseDescriptor.setGCLogThreshold((int) threshold);
}

public int getGcConcurrentPhaseWarnThresholdInMs()
{
return DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold();
}

public void setGcConcurrentPhaseWarnThresholdInMs(int threshold)
{
long gcConcurrentPhaseLogThresholdInMs = getGcConcurrentPhaseLogThresholdInMs();
if (threshold < 0)
throw new IllegalArgumentException("Threshold must be greater than or equal to 0");
if (threshold != 0 && threshold <= gcConcurrentPhaseLogThresholdInMs)
throw new IllegalArgumentException("Threshold must be greater than gcConcurrentPhaseLogThresholdInMs which is currently "
+ gcConcurrentPhaseLogThresholdInMs);
DatabaseDescriptor.setGCConcurrentPhaseWarnThreshold(threshold);
}

public int getGcConcurrentPhaseLogThresholdInMs()
{
return DatabaseDescriptor.getGCConcurrentPhaseLogThreshold();
}

public void setGcConcurrentPhaseLogThresholdInMs(int threshold)
{
if (threshold <= 0)
throw new IllegalArgumentException("Threshold must be greater than 0");

long gcConcurrentPhaseWarnThresholdInMs = getGcConcurrentPhaseWarnThresholdInMs();
if (gcConcurrentPhaseWarnThresholdInMs != 0 && threshold > gcConcurrentPhaseWarnThresholdInMs)
throw new IllegalArgumentException("Threshold must be less than gcConcurrentPhaseWarnThresholdInMs which is currently "
+ gcConcurrentPhaseWarnThresholdInMs);

DatabaseDescriptor.setGCConcurrentPhaseLogThreshold(threshold);
}

public long getGcLogThresholdInMs()
{
return DatabaseDescriptor.getGCLogThreshold();
Expand Down
4 changes: 4 additions & 0 deletions src/java/org/apache/cassandra/service/GCInspectorMXBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ public interface GCInspectorMXBean
void setGcWarnThresholdInMs(long threshold);
long getGcWarnThresholdInMs();
void setGcLogThresholdInMs(long threshold);
int getGcConcurrentPhaseLogThresholdInMs();
void setGcConcurrentPhaseWarnThresholdInMs(int threshold);
int getGcConcurrentPhaseWarnThresholdInMs();
void setGcConcurrentPhaseLogThresholdInMs(int threshold);
long getGcLogThresholdInMs();
long getStatusThresholdInMs();
}
104 changes: 87 additions & 17 deletions test/unit/org/apache/cassandra/service/GCInspectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,16 @@
package org.apache.cassandra.service;

import org.apache.cassandra.config.DatabaseDescriptor;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;


public class GCInspectorTest
{

Expand All @@ -43,49 +48,98 @@ public void before()
@Test
public void ensureStaticFieldsHydrateFromConfig()
{
Assert.assertEquals(DatabaseDescriptor.getGCLogThreshold(), gcInspector.getGcLogThresholdInMs());
Assert.assertEquals(DatabaseDescriptor.getGCWarnThreshold(), gcInspector.getGcWarnThresholdInMs());
assertEquals(DatabaseDescriptor.getGCLogThreshold(), gcInspector.getGcLogThresholdInMs());
assertEquals(DatabaseDescriptor.getGCWarnThreshold(), gcInspector.getGcWarnThresholdInMs());
assertEquals(DatabaseDescriptor.getGCConcurrentPhaseLogThreshold(), gcInspector.getGcConcurrentPhaseLogThresholdInMs());
assertEquals(DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold(), gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
}

@Test
public void ensureStatusIsCalculated()
{
Assert.assertTrue(gcInspector.getStatusThresholdInMs() > 0);
assertTrue(gcInspector.getStatusThresholdInMs() > 0);
}

@Test(expected=IllegalArgumentException.class)
@Test
public void ensureWarnGreaterThanLog()
{
gcInspector.setGcWarnThresholdInMs(gcInspector.getGcLogThresholdInMs());
try
{
gcInspector.setGcWarnThresholdInMs(gcInspector.getGcLogThresholdInMs());
fail("Expect IllegalArgumentException");
}
catch (IllegalArgumentException e)
{
// expected
}
try
{
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(gcInspector.getGcConcurrentPhaseLogThresholdInMs());
fail("Expect IllegalArgumentException");
}
catch (IllegalArgumentException e)
{
// expected
}
}

@Test
public void ensureZeroIsOk()
{
gcInspector.setGcWarnThresholdInMs(0);
Assert.assertEquals(gcInspector.getStatusThresholdInMs(), gcInspector.getGcLogThresholdInMs());
Assert.assertEquals(0, DatabaseDescriptor.getGCWarnThreshold());
Assert.assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
assertEquals(gcInspector.getStatusThresholdInMs(), gcInspector.getGcLogThresholdInMs());
assertEquals(0, DatabaseDescriptor.getGCWarnThreshold());
assertEquals(200, DatabaseDescriptor.getGCLogThreshold());

gcInspector.setGcConcurrentPhaseWarnThresholdInMs(0);
assertEquals(0, DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold());
assertEquals(1000, DatabaseDescriptor.getGCConcurrentPhaseLogThreshold());
}

@Test(expected=IllegalArgumentException.class)
@Test
public void ensureLogLessThanWarn()
{
Assert.assertEquals(200, gcInspector.getGcLogThresholdInMs());
assertEquals(200, gcInspector.getGcLogThresholdInMs());
gcInspector.setGcWarnThresholdInMs(1000);
Assert.assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
gcInspector.setGcLogThresholdInMs(gcInspector.getGcWarnThresholdInMs() + 1);
assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
try
{
gcInspector.setGcLogThresholdInMs(gcInspector.getGcWarnThresholdInMs() + 1);
fail("Expect IllegalArgumentException");
}
catch (IllegalArgumentException e)
{
// expected
}
assertEquals(1000, gcInspector.getGcConcurrentPhaseLogThresholdInMs());
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(2000);
assertEquals(2000, gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
try
{
gcInspector.setGcConcurrentPhaseLogThresholdInMs(gcInspector.getGcConcurrentPhaseWarnThresholdInMs() + 1);
fail("Expect IllegalArgumentException");
}
catch (IllegalArgumentException e)
{
// expected
}
}

@Test
public void testDefaults()
{
gcInspector.setGcLogThresholdInMs(200);
gcInspector.setGcWarnThresholdInMs(1000);
Assert.assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
Assert.assertEquals(200, gcInspector.getGcLogThresholdInMs());
Assert.assertEquals(1000, DatabaseDescriptor.getGCWarnThreshold());
Assert.assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
gcInspector.setGcConcurrentPhaseLogThresholdInMs(1000);
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(2000);
assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
assertEquals(200, gcInspector.getGcLogThresholdInMs());
assertEquals(1000, DatabaseDescriptor.getGCWarnThreshold());
assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
assertEquals(1000, DatabaseDescriptor.getGCConcurrentPhaseLogThreshold());
assertEquals(1000, gcInspector.getGcConcurrentPhaseLogThresholdInMs());
assertEquals(2000, DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold());
assertEquals(2000, gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
}

@Test(expected=IllegalArgumentException.class)
Expand All @@ -94,4 +148,20 @@ public void testMaxValue()
gcInspector.setGcLogThresholdInMs(200);
gcInspector.setGcWarnThresholdInMs(Integer.MAX_VALUE+1L);
}

@Test
public void testIsConcurrentPhase()
{
assertTrue("No GC cause should be considered concurrent",
GCInspector.isConcurrentPhase("No GC", "SomeGCName"));
assertTrue("Shenandoah Cycles should be considered concurrent",
GCInspector.isConcurrentPhase("SomeCause", "Shenandoah Cycles"));
assertTrue("ZGC Cycles should be considered concurrent",
GCInspector.isConcurrentPhase("SomeCause", "ZGC Cycles"));

assertFalse("ParallelGC should not be considered concurrent",
GCInspector.isConcurrentPhase("Allocation Failure", "PS Scavenge"));
assertFalse("G1 Young Generation should not be considered concurrent",
GCInspector.isConcurrentPhase("G1 Evacuation Pause", "G1 Young Generation"));
}
}