mirror of
https://github.com/JetBrains/JetBrainsRuntime.git
synced 2025-12-06 09:29:38 +01:00
8312182: THPs cause huge RSS due to thread start timing issue
8310687: JDK-8303215 is incomplete
Backport-of: 84b325b844
This commit is contained in:
committed by
Vitaly Provodin
parent
92e7de91ee
commit
2f7800d8b1
@@ -89,6 +89,15 @@
|
||||
"to disable both the override and the printouts." \
|
||||
"See prctl(PR_SET_TIMERSLACK) for more info.") \
|
||||
\
|
||||
product(bool, DisableTHPStackMitigation, false, DIAGNOSTIC, \
|
||||
"If THPs are unconditionally enabled on the system (mode " \
|
||||
"\"always\"), the JVM will prevent THP from forming in " \
|
||||
"thread stacks. This switch disables that mitigation and " \
|
||||
"allows THPs to form in thread stacks.") \
|
||||
\
|
||||
develop(bool, DelayThreadStartALot, false, \
|
||||
"Artificially delay thread starts randomly for testing.") \
|
||||
\
|
||||
|
||||
|
||||
// end of RUNTIME_OS_FLAGS
|
||||
|
||||
@@ -775,6 +775,10 @@ static void *thread_native_entry(Thread *thread) {
|
||||
|
||||
assert(osthread->pthread_id() != 0, "pthread_id was not set as expected");
|
||||
|
||||
if (DelayThreadStartALot) {
|
||||
os::naked_short_sleep(100);
|
||||
}
|
||||
|
||||
// call one more level start routine
|
||||
thread->call_run();
|
||||
|
||||
@@ -911,6 +915,7 @@ bool os::create_thread(Thread* thread, ThreadType thr_type,
|
||||
// Calculate stack size if it's not specified by caller.
|
||||
size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size);
|
||||
size_t guard_size = os::Linux::default_guard_size(thr_type);
|
||||
|
||||
// Configure glibc guard page. Must happen before calling
|
||||
// get_static_tls_area_size(), which uses the guard_size.
|
||||
pthread_attr_setguardsize(&attr, guard_size);
|
||||
@@ -931,13 +936,16 @@ bool os::create_thread(Thread* thread, ThreadType thr_type,
|
||||
}
|
||||
assert(is_aligned(stack_size, os::vm_page_size()), "stack_size not aligned");
|
||||
|
||||
// Add an additional page to the stack size to reduce its chances of getting large page aligned
|
||||
// so that the stack does not get backed by a transparent huge page.
|
||||
size_t default_large_page_size = HugePages::default_static_hugepage_size();
|
||||
if (default_large_page_size != 0 &&
|
||||
stack_size >= default_large_page_size &&
|
||||
is_aligned(stack_size, default_large_page_size)) {
|
||||
stack_size += os::vm_page_size();
|
||||
if (!DisableTHPStackMitigation) {
|
||||
// In addition to the glibc guard page that prevents inter-thread-stack hugepage
|
||||
// coalescing (see comment in os::Linux::default_guard_size()), we also make
|
||||
// sure the stack size itself is not huge-page-size aligned; that makes it much
|
||||
// more likely for thread stack boundaries to be unaligned as well and hence
|
||||
// protects thread stacks from being targeted by khugepaged.
|
||||
if (HugePages::thp_pagesize() > 0 &&
|
||||
is_aligned(stack_size, HugePages::thp_pagesize())) {
|
||||
stack_size += os::vm_page_size();
|
||||
}
|
||||
}
|
||||
|
||||
int status = pthread_attr_setstacksize(&attr, stack_size);
|
||||
@@ -3081,6 +3089,27 @@ bool os::Linux::libnuma_init() {
|
||||
}
|
||||
|
||||
size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
|
||||
|
||||
if (!DisableTHPStackMitigation) {
|
||||
// If THPs are unconditionally enabled, the following scenario can lead to huge RSS
|
||||
// - parent thread spawns, in quick succession, multiple child threads
|
||||
// - child threads are slow to start
|
||||
// - thread stacks of future child threads are adjacent and get merged into one large VMA
|
||||
// by the kernel, and subsequently transformed into huge pages by khugepaged
|
||||
// - child threads come up, place JVM guard pages, thus splinter the large VMA, splinter
|
||||
// the huge pages into many (still paged-in) small pages.
|
||||
// The result of that sequence are thread stacks that are fully paged-in even though the
|
||||
// threads did not even start yet.
|
||||
// We prevent that by letting the glibc allocate a guard page, which causes a VMA with different
|
||||
// permission bits to separate two ajacent thread stacks and therefore prevent merging stacks
|
||||
// into one VMA.
|
||||
//
|
||||
// Yes, this means we have two guard sections - the glibc and the JVM one - per thread. But the
|
||||
// cost for that one extra protected page is dwarfed from a large win in performance and memory
|
||||
// that avoiding interference by khugepaged buys us.
|
||||
return os::vm_page_size();
|
||||
}
|
||||
|
||||
// Creating guard page is very expensive. Java thread has HotSpot
|
||||
// guard pages, only enable glibc guard page for non-Java threads.
|
||||
// (Remember: compiler thread is a Java thread, too!)
|
||||
@@ -3744,6 +3773,21 @@ void os::large_page_init() {
|
||||
// Query OS information first.
|
||||
HugePages::initialize();
|
||||
|
||||
// If THPs are unconditionally enabled (THP mode "always"), khugepaged may attempt to
|
||||
// coalesce small pages in thread stacks to huge pages. That costs a lot of memory and
|
||||
// is usually unwanted for thread stacks. Therefore we attempt to prevent THP formation in
|
||||
// thread stacks unless the user explicitly allowed THP formation by manually disabling
|
||||
// -XX:+DisableTHPStackMitigation.
|
||||
if (HugePages::thp_mode() == THPMode::always) {
|
||||
if (DisableTHPStackMitigation) {
|
||||
log_info(pagesize)("JVM will *not* prevent THPs in thread stacks. This may cause high RSS.");
|
||||
} else {
|
||||
log_info(pagesize)("JVM will attempt to prevent THPs in thread stacks.");
|
||||
}
|
||||
} else {
|
||||
FLAG_SET_ERGO(DisableTHPStackMitigation, true); // Mitigation not needed
|
||||
}
|
||||
|
||||
// 1) Handle the case where we do not want to use huge pages
|
||||
if (!UseLargePages &&
|
||||
!UseTransparentHugePages &&
|
||||
|
||||
@@ -0,0 +1,241 @@
|
||||
/*
|
||||
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2023, Red Hat Inc.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test id=ENABLED
|
||||
* @bug 8303215 8312182
|
||||
* @summary On THP=always systems, we prevent THPs from forming within thread stacks
|
||||
* @library /test/lib
|
||||
* @requires os.family == "linux"
|
||||
* @requires vm.debug
|
||||
* @requires os.arch=="amd64" | os.arch=="x86_64" | os.arch=="aarch64"
|
||||
* @modules java.base/jdk.internal.misc
|
||||
* java.management
|
||||
* @run driver THPsInThreadStackPreventionTest PATCH-ENABLED
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test id=DISABLED
|
||||
* @bug 8303215 8312182
|
||||
* @summary On THP=always systems, we prevent THPs from forming within thread stacks (negative test)
|
||||
* @library /test/lib
|
||||
* @requires os.family == "linux"
|
||||
* @requires vm.debug
|
||||
* @requires os.arch=="amd64" | os.arch=="x86_64" | os.arch=="aarch64"
|
||||
* @modules java.base/jdk.internal.misc
|
||||
* java.management
|
||||
* @run main/manual THPsInThreadStackPreventionTest PATCH-DISABLED
|
||||
*/
|
||||
import jdk.test.lib.process.OutputAnalyzer;
|
||||
import jdk.test.lib.process.ProcessTools;
|
||||
import jtreg.SkippedException;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.BrokenBarrierException;
|
||||
import java.util.concurrent.CyclicBarrier;
|
||||
|
||||
public class THPsInThreadStackPreventionTest {
|
||||
|
||||
// We test the mitigation for "huge rss for THP=always" introduced with JDK-8312182 and JDK-8302015:
|
||||
//
|
||||
// We start a program that spawns a ton of threads with a stack size close to THP page size. The threads
|
||||
// are idle and should not build up a lot of stack. The threads are started with an artificial delay
|
||||
// between thread start and stack guardpage creation, which exacerbates the RSS bloat (for explanation
|
||||
// please see 8312182).
|
||||
//
|
||||
// We then observe RSS of that program. We expect it to stay below a reasonable maximum. The unpatched
|
||||
// version should show an RSS of ~2 GB (paying for the fully paged in thread stacks). The fixed variant should
|
||||
// cost only ~200-400 MB.
|
||||
|
||||
static final int numThreads = 1000;
|
||||
static final long threadStackSizeMB = 2; // must be 2M
|
||||
static final long heapSizeMB = 64;
|
||||
static final long basicRSSOverheadMB = heapSizeMB + 150;
|
||||
// A successful completion of this test would show not more than X KB per thread stack.
|
||||
static final long acceptableRSSPerThreadStack = 128 * 1024;
|
||||
static final long acceptableRSSForAllThreadStacks = numThreads * acceptableRSSPerThreadStack;
|
||||
static final long acceptableRSSLimitMB = (acceptableRSSForAllThreadStacks / (1024 * 1024)) + basicRSSOverheadMB;
|
||||
|
||||
private static class TestMain {
|
||||
|
||||
static class Sleeper extends Thread {
|
||||
CyclicBarrier barrier;
|
||||
public Sleeper(CyclicBarrier barrier) {
|
||||
this.barrier = barrier;
|
||||
}
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
barrier.await(); // wait for all siblings
|
||||
barrier.await(); // wait main thread to print status
|
||||
} catch (InterruptedException | BrokenBarrierException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws BrokenBarrierException, InterruptedException {
|
||||
|
||||
// Fire up 1000 threads with 2M stack size each.
|
||||
Sleeper[] threads = new Sleeper[numThreads];
|
||||
CyclicBarrier barrier = new CyclicBarrier(numThreads + 1);
|
||||
|
||||
for (int i = 0; i < numThreads; i++) {
|
||||
threads[i] = new Sleeper(barrier);
|
||||
threads[i].start();
|
||||
}
|
||||
|
||||
// Wait for all threads to come up
|
||||
barrier.await();
|
||||
|
||||
// print status
|
||||
String file = "/proc/self/status";
|
||||
try (FileReader fr = new FileReader(file);
|
||||
BufferedReader reader = new BufferedReader(fr)) {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
System.out.println(line);
|
||||
}
|
||||
} catch (IOException | NumberFormatException e) { /* ignored */ }
|
||||
|
||||
// Signal threads to stop
|
||||
barrier.await();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static class ProcSelfStatus {
|
||||
|
||||
public long rssMB;
|
||||
public long swapMB;
|
||||
public int numLifeThreads;
|
||||
|
||||
// Parse output from /proc/self/status
|
||||
public static ProcSelfStatus parse(OutputAnalyzer o) {
|
||||
ProcSelfStatus status = new ProcSelfStatus();
|
||||
String s = o.firstMatch("Threads:\\s*(\\d+)", 1);
|
||||
Objects.requireNonNull(s);
|
||||
status.numLifeThreads = Integer.parseInt(s);
|
||||
s = o.firstMatch("VmRSS:\\s*(\\d+) kB", 1);
|
||||
Objects.requireNonNull(s);
|
||||
status.rssMB = Long.parseLong(s) / 1024;
|
||||
s = o.firstMatch("VmSwap:\\s*(\\d+) kB", 1);
|
||||
Objects.requireNonNull(s);
|
||||
status.swapMB = Long.parseLong(s) / 1024;
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
HugePageConfiguration config = HugePageConfiguration.readFromOS();
|
||||
// This issue is bound to THP=always
|
||||
if (config.getThpMode() != HugePageConfiguration.THPMode.always) {
|
||||
throw new SkippedException("Test only makes sense in THP \"always\" mode");
|
||||
}
|
||||
|
||||
String[] defaultArgs = {
|
||||
"-Xlog:pagesize",
|
||||
"-Xmx" + heapSizeMB + "m", "-Xms" + heapSizeMB + "m", "-XX:+AlwaysPreTouch", // stabilize RSS
|
||||
"-Xss" + threadStackSizeMB + "m",
|
||||
"-XX:-CreateCoredumpOnCrash",
|
||||
// This will delay the child threads before they create guard pages, thereby greatly increasing the
|
||||
// chance of large VMA formation + hugepage coalescation; see JDK-8312182
|
||||
"-XX:+DelayThreadStartALot"
|
||||
};
|
||||
ArrayList<String> finalargs = new ArrayList<>(Arrays.asList(defaultArgs));
|
||||
|
||||
switch (args[0]) {
|
||||
case "PATCH-ENABLED": {
|
||||
finalargs.add(TestMain.class.getName());
|
||||
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(finalargs);
|
||||
|
||||
OutputAnalyzer output = new OutputAnalyzer(pb.start());
|
||||
output.shouldHaveExitValue(0);
|
||||
|
||||
// this line indicates the mitigation is active:
|
||||
output.shouldContain("[pagesize] JVM will attempt to prevent THPs in thread stacks.");
|
||||
|
||||
ProcSelfStatus status = ProcSelfStatus.parse(output);
|
||||
if (status.numLifeThreads < numThreads) {
|
||||
throw new RuntimeException("Number of live threads lower than expected: " + status.numLifeThreads + ", expected " + numThreads);
|
||||
} else {
|
||||
System.out.println("Found " + status.numLifeThreads + " to be alive. Ok.");
|
||||
}
|
||||
|
||||
long rssPlusSwapMB = status.swapMB + status.rssMB;
|
||||
|
||||
if (rssPlusSwapMB > acceptableRSSLimitMB) {
|
||||
throw new RuntimeException("RSS+Swap larger than expected: " + rssPlusSwapMB + "m, expected at most " + acceptableRSSLimitMB + "m");
|
||||
} else {
|
||||
if (rssPlusSwapMB < heapSizeMB) { // we pretouch the java heap, so we expect to see at least that:
|
||||
throw new RuntimeException("RSS+Swap suspiciously low: " + rssPlusSwapMB + "m, expected at least " + heapSizeMB + "m");
|
||||
}
|
||||
System.out.println("Okay: RSS+Swap=" + rssPlusSwapMB + ", within acceptable limit of " + acceptableRSSLimitMB);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case "PATCH-DISABLED": {
|
||||
|
||||
// Only execute manually! this will allocate ~2gb of memory!
|
||||
|
||||
// explicitly disable the no-THP-workaround:
|
||||
finalargs.add("-XX:+UnlockDiagnosticVMOptions");
|
||||
finalargs.add("-XX:+DisableTHPStackMitigation");
|
||||
|
||||
finalargs.add(TestMain.class.getName());
|
||||
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(finalargs);
|
||||
OutputAnalyzer output = new OutputAnalyzer(pb.start());
|
||||
|
||||
output.shouldHaveExitValue(0);
|
||||
|
||||
// We deliberately switched off mitigation, VM should tell us:
|
||||
output.shouldContain("[pagesize] JVM will *not* prevent THPs in thread stacks. This may cause high RSS.");
|
||||
|
||||
// Parse output from self/status
|
||||
ProcSelfStatus status = ProcSelfStatus.parse(output);
|
||||
if (status.numLifeThreads < numThreads) {
|
||||
throw new RuntimeException("Number of live threads lower than expected (" + status.numLifeThreads + ", expected " + numThreads +")");
|
||||
} else {
|
||||
System.out.println("Found " + status.numLifeThreads + " to be alive. Ok.");
|
||||
}
|
||||
|
||||
long rssPlusSwapMB = status.swapMB + status.rssMB;
|
||||
|
||||
if (rssPlusSwapMB < acceptableRSSLimitMB) {
|
||||
throw new RuntimeException("RSS+Swap lower than expected: " + rssPlusSwapMB + "m, expected more than " + acceptableRSSLimitMB + "m");
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default: throw new RuntimeException("Bad argument: " + args[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user