Add analysis why the memory request failed
This commit is contained in:
parent
beb00ce17e
commit
ec8b4c2374
@ -68,6 +68,22 @@ THIS PROGRAM COMES WITH NO WARRANTY
|
||||
display: block;
|
||||
}
|
||||
|
||||
.js-alloc-failure--show {
|
||||
/* empty - used to hide/show detailed allocation failure analysis */
|
||||
}
|
||||
|
||||
.js-alloc-failure-below-low-watermark--show {
|
||||
/* empty - used to hide/show details for failed memory allocations */
|
||||
}
|
||||
|
||||
.js-alloc-failure-no-free-chunks--show {
|
||||
/* empty - used to hide/show details for failed memory allocations */
|
||||
}
|
||||
|
||||
.js-alloc-failure-unknown-reason-show {
|
||||
/* empty - used to hide/show details for failed memory allocations */
|
||||
}
|
||||
|
||||
.js-killed-proc-score--show {
|
||||
/* empty - used to hide/show OOM score of killed process */
|
||||
}
|
||||
@ -378,16 +394,6 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
|
||||
The OOM killer calculates a score for each process and terminates the process with the highest score.
|
||||
</p>
|
||||
</div>
|
||||
<p>
|
||||
The process "<span class="killed_proc_name"></span>"
|
||||
(PID <span class="killed_proc_pid"></span>)
|
||||
<span class="js-text--default-show js-killed-proc-score--show">
|
||||
with an OOM score of <span class="killed_proc_score"></span>
|
||||
</span>
|
||||
has been terminated. It uses <span class="killed_proc_rss_percent"></span>
|
||||
(<span class="killed_proc_total_rss_kb"></span>) of the resident memory.
|
||||
</p>
|
||||
|
||||
<div class="js-text--default-show js-swap-active--show">
|
||||
<p>
|
||||
The system has <span class="system_total_ram_kb"></span> physical memory and
|
||||
@ -406,6 +412,36 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
|
||||
physical memory are in use.
|
||||
</p>
|
||||
</div>
|
||||
<p>
|
||||
The process "<span class="killed_proc_name"></span>"
|
||||
(PID <span class="killed_proc_pid"></span>)
|
||||
<span class="js-text--default-show js-killed-proc-score--show">
|
||||
with an OOM score of <span class="killed_proc_score"></span>
|
||||
</span>
|
||||
has been terminated. It uses <span class="killed_proc_rss_percent"></span>
|
||||
(<span class="killed_proc_total_rss_kb"></span>) of the resident memory.
|
||||
</p>
|
||||
<div class="js-text--default-hide js-alloc-failure--show">
|
||||
<p>
|
||||
A memory chunk of order <span class="trigger_proc_order"></span> (
|
||||
<span class="trigger_proc_requested_memory_pages"></span> /
|
||||
<span class="trigger_proc_requested_memory_pages_kb"></span> ) has been requested from the
|
||||
"<span class="trigger_proc_mem_zone"></span>" memory zone.
|
||||
|
||||
<span class="js-text--default-hide js-alloc-failure-below-low-watermark--show">
|
||||
The request failed because after its fulfillment the free memory would be below the memory
|
||||
low watermark.
|
||||
</span>
|
||||
<span class="js-text--default-hide js-alloc-failure-no-free-chunks--show">
|
||||
If this requirement were met, the free memory would still be above the low memory watermark.
|
||||
The request failed because there is no free chunk in the current or higher order.
|
||||
</span>
|
||||
<span class="js-text--default-hide js-alloc-failure-unknown-reason-show">
|
||||
The request failed, but the reason is unknown.
|
||||
</span>
|
||||
This analysis result is an estimate because the kernel reduces the minimum watermark in some rare cases.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>Details of analysis</h3>
|
||||
@ -955,6 +991,7 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
|
||||
<li>Fix and rework calculation of GFP flags</li>
|
||||
<li>Add GFP flags for more kernel releases</li>
|
||||
<li>Display missing memory chunks (buddyinfo) again</li>
|
||||
<li>Add analysis why the memory request failed</li>
|
||||
<li>...</li>
|
||||
</ol>
|
||||
|
||||
|
243
OOMAnalyser.py
243
OOMAnalyser.py
@ -132,6 +132,28 @@ class OOMEntityType:
|
||||
manual = 2
|
||||
|
||||
|
||||
class OOMMemoryAllocFailureType:
|
||||
"""Enum to store the results why the memory allocation could have failed"""
|
||||
|
||||
not_started = 0
|
||||
"""Analysis not started"""
|
||||
|
||||
missing_data = 1
|
||||
"""Missing data to start analysis"""
|
||||
|
||||
failed_below_low_watermark = 2
|
||||
"""Failed, because after satisfying this request, the free memory will be below the low memory watermark"""
|
||||
|
||||
failed_no_free_chunks = 3
|
||||
"""Failed, because no suitable chunk is free in the current or any higher order."""
|
||||
|
||||
failed_unknown_reason = 4
|
||||
"""Failed, but the reason is unknown"""
|
||||
|
||||
skipped_high_order_dont_trigger_oom = 5
|
||||
""""high order" requests don't trigger OOM"""
|
||||
|
||||
|
||||
def is_visible(element):
|
||||
return element.offsetWidth > 0 and element.offsetHeight > 0
|
||||
|
||||
@ -484,6 +506,11 @@ class BaseKernelConfig:
|
||||
]
|
||||
"""Elements of the process table"""
|
||||
|
||||
PAGE_ALLOC_COSTLY_ORDER = 3
|
||||
"""
|
||||
Requests with order > PAGE_ALLOC_COSTLY_ORDER will never trigger the OOM-killer to satisfy the request.
|
||||
"""
|
||||
|
||||
pstable_html = [
|
||||
"PID",
|
||||
"UID",
|
||||
@ -549,6 +576,13 @@ class BaseKernelConfig:
|
||||
:type: str
|
||||
"""
|
||||
|
||||
ZONE_TYPES = ["DMA", "DMA32", "Normal", "HighMem", "Movable"]
|
||||
"""
|
||||
List of memory zones
|
||||
|
||||
@type: List(str)
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@ -2670,26 +2704,9 @@ class OOMEntity:
|
||||
class OOMResult:
|
||||
"""Results of an OOM analysis"""
|
||||
|
||||
kconfig = BaseKernelConfig()
|
||||
"""Kernel configuration"""
|
||||
|
||||
details = {}
|
||||
"""Extracted result"""
|
||||
|
||||
oom_entity = None
|
||||
"""
|
||||
State of this OOM (unknown, incomplete, ...)
|
||||
|
||||
:type: OOMEntityState
|
||||
"""
|
||||
|
||||
oom_type = OOMEntityType.unknown
|
||||
"""
|
||||
Type of this OOM (manually or automatically triggered)
|
||||
|
||||
:type: OOMEntityType
|
||||
"""
|
||||
|
||||
error_msg = ""
|
||||
"""
|
||||
Error message
|
||||
@ -2697,6 +2714,9 @@ class OOMResult:
|
||||
@type: str
|
||||
"""
|
||||
|
||||
kconfig = BaseKernelConfig()
|
||||
"""Kernel configuration"""
|
||||
|
||||
kversion = None
|
||||
"""
|
||||
Kernel version
|
||||
@ -2704,6 +2724,19 @@ class OOMResult:
|
||||
@type: str
|
||||
"""
|
||||
|
||||
mem_alloc_failure = OOMMemoryAllocFailureType.not_started
|
||||
"""State/result of the memory allocation failure analysis
|
||||
|
||||
@see: OOMAnalyser._analyse_alloc_failure()
|
||||
"""
|
||||
|
||||
oom_entity = None
|
||||
"""
|
||||
State of this OOM (unknown, incomplete, ...)
|
||||
|
||||
:type: OOMEntityState
|
||||
"""
|
||||
|
||||
oom_text = None
|
||||
"""
|
||||
OOM text
|
||||
@ -2711,6 +2744,13 @@ class OOMResult:
|
||||
@type: str
|
||||
"""
|
||||
|
||||
oom_type = OOMEntityType.unknown
|
||||
"""
|
||||
Type of this OOM (manually or automatically triggered)
|
||||
|
||||
:type: OOMEntityType
|
||||
"""
|
||||
|
||||
swap_active = False
|
||||
"""
|
||||
Swap space active or inactive
|
||||
@ -3063,7 +3103,8 @@ class OOMAnalyser:
|
||||
Extract memory watermark information from all zones
|
||||
|
||||
This function fills:
|
||||
* OOMResult.details["_watermarks"] with [<zone>][<node>][(free|min|low|high)] = <XXX>
|
||||
* OOMResult.details["_watermarks"] with [<zone>][<node>][(free|min|low|high)] = int
|
||||
* OOMResult.details["_watermarks"] with [<zone>][<node>][(lowmem_reserve)] = List(int)
|
||||
"""
|
||||
self.oom_result.details["_watermarks"] = {}
|
||||
watermark_info = self.oom_result.details["_watermarks"]
|
||||
@ -3075,9 +3116,16 @@ class OOMAnalyser:
|
||||
# Therefore, we reset the counter by one line.
|
||||
self.oom_entity.back()
|
||||
|
||||
node = None
|
||||
zone = None
|
||||
for line in self.oom_entity:
|
||||
match = self.REC_WATERMARK.match(line)
|
||||
if not match:
|
||||
if line.startswith("lowmem_reserve[]:"):
|
||||
# zone and node are defined in the previous round
|
||||
watermark_info[zone][node]["lowmem_reserve"] = [
|
||||
int(v) for v in line.split()[1:]
|
||||
]
|
||||
continue
|
||||
|
||||
node = int(match.group("node"))
|
||||
@ -3089,6 +3137,32 @@ class OOMAnalyser:
|
||||
for i in ["free", "min", "low", "high"]:
|
||||
watermark_info[zone][node][i] = int(match.group(i))
|
||||
|
||||
def _extract_node_from_watermarks(self, zone):
|
||||
"""
|
||||
Search node with memory shortage: watermark "free" < "min"
|
||||
|
||||
@param str zone: Requested zone
|
||||
@return: First node with memory shortage or None if no node found
|
||||
@rtype: None|int
|
||||
"""
|
||||
watermark_info = self.oom_result.details["_watermarks"]
|
||||
if zone not in watermark_info:
|
||||
debug(
|
||||
"Missing watermark info for zone {} - skip memory analysis".format(zone)
|
||||
)
|
||||
return None
|
||||
# __pragma__ ('jsiter')
|
||||
for node in watermark_info[zone]:
|
||||
if watermark_info[zone][node]["free"] < watermark_info[zone][node]["min"]:
|
||||
return int(node)
|
||||
# __pragma__ ('nojsiter')
|
||||
|
||||
debug(
|
||||
"Node with current memory shortage cannot be determined - skip memory analysis"
|
||||
)
|
||||
|
||||
return None
|
||||
|
||||
def _gfp_hex2flags(self, hexvalue):
|
||||
"""\
|
||||
Convert the hexadecimal value into flags specified by definition
|
||||
@ -3170,6 +3244,114 @@ class OOMAnalyser:
|
||||
ps_index.sort(key=int)
|
||||
self.oom_result.details["_pstable_index"] = ps_index
|
||||
|
||||
def _check_free_chunks(self, start_with_order, zone, node):
|
||||
"""Check for at least one free chunk in the current or any higher order.
|
||||
|
||||
Returns True, if at lease one suitable chunk is free.
|
||||
Returns None, if buddyinfo doesn't contain information for the requested node, order or zone
|
||||
|
||||
@param int start_with_order: Start checking with this order
|
||||
@param str zone: Memory zone
|
||||
@param int node: Node number
|
||||
@rtype: None|bool
|
||||
"""
|
||||
if not self.oom_result.details["_buddyinfo"]:
|
||||
return None
|
||||
buddyinfo = self.oom_result.details["_buddyinfo"]
|
||||
if zone not in buddyinfo:
|
||||
return None
|
||||
|
||||
for order in range(start_with_order, self.oom_result.kconfig.MAX_ORDER):
|
||||
if order not in buddyinfo[zone]:
|
||||
break
|
||||
if node not in buddyinfo[zone][order]:
|
||||
return None
|
||||
free_chunks = buddyinfo[zone][order][node]
|
||||
if free_chunks:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _analyse_alloc_failure(self):
|
||||
"""
|
||||
Analyse why the memory allocation could be failed.
|
||||
|
||||
The code in this function is inspired by mm/page_alloc.c:__zone_watermark_ok()
|
||||
"""
|
||||
self.oom_result.mem_alloc_failure = OOMMemoryAllocFailureType.not_started
|
||||
|
||||
if self.oom_result.oom_type == OOMEntityType.manual:
|
||||
debug("OOM triggered manually - skip memory analysis")
|
||||
return
|
||||
if "_buddyinfo" not in self.oom_result.details:
|
||||
debug("Missing buddyinfo - skip memory analysis")
|
||||
return
|
||||
if not self.oom_result.details["_buddyinfo"]:
|
||||
debug("Empty buddyinfo - skip memory analysis")
|
||||
return
|
||||
if ("trigger_proc_order" not in self.oom_result.details) or (
|
||||
"trigger_proc_mem_zone" not in self.oom_result.details
|
||||
):
|
||||
debug(
|
||||
"Missing trigger_proc_order and/or trigger_proc_mem_zone - skip memory analysis"
|
||||
)
|
||||
return
|
||||
if "_watermarks" not in self.oom_result.details:
|
||||
debug("Missing watermark information - skip memory analysis")
|
||||
return
|
||||
|
||||
order = self.oom_result.details["trigger_proc_order"]
|
||||
zone = self.oom_result.details["trigger_proc_mem_zone"]
|
||||
watermark_info = self.oom_result.details["_watermarks"]
|
||||
|
||||
# "high order" requests don't trigger OOM
|
||||
if int(order) > self.oom_result.kconfig.PAGE_ALLOC_COSTLY_ORDER:
|
||||
debug("high order requests should not trigger OOM - skip memory analysis")
|
||||
self.oom_result.mem_alloc_failure = (
|
||||
OOMMemoryAllocFailureType.skipped_high_order_dont_trigger_oom
|
||||
)
|
||||
return
|
||||
|
||||
# Search node with memory shortage: watermark "free" < "min"
|
||||
node = self._extract_node_from_watermarks(zone)
|
||||
if node is None:
|
||||
return # error cause already shown as debug message
|
||||
|
||||
# the remaining code is similar to mm/page_alloc.c:__zone_watermark_ok()
|
||||
# =======================================================================
|
||||
|
||||
# calculation in kB and not in pages
|
||||
free_kb = watermark_info[zone][node]["free"]
|
||||
highest_zoneidx = self.oom_result.kconfig.ZONE_TYPES.index(zone)
|
||||
lowmem_reserve = watermark_info[zone][node]["lowmem_reserve"]
|
||||
min_kb = watermark_info[zone][node]["low"]
|
||||
page_size = self.oom_result.details["_buddyinfo_pagesize_kb"]
|
||||
|
||||
# reduce minimum watermark for high priority calls
|
||||
# ALLOC_HIGH == __GFP_HIGH
|
||||
gfp_mask_decimal = self.oom_result.details["_trigger_proc_gfp_mask_decimal"]
|
||||
gfp_flag_high = self.oom_result.kconfig.GFP_FLAGS["__GFP_DMA"]["_value"]
|
||||
if (gfp_mask_decimal & gfp_flag_high) == gfp_flag_high:
|
||||
min_kb -= int(min_kb / 2)
|
||||
|
||||
# check watermarks, if these are not met, then a high-order request also
|
||||
# cannot go ahead even if a suitable page happened to be free.
|
||||
if free_kb <= (min_kb + (lowmem_reserve[highest_zoneidx] * page_size)):
|
||||
self.oom_result.mem_alloc_failure = (
|
||||
OOMMemoryAllocFailureType.failed_below_low_watermark
|
||||
)
|
||||
return
|
||||
|
||||
# For a high-order request, check at least one suitable page is free
|
||||
if not self._check_free_chunks(order, zone, node):
|
||||
self.oom_result.mem_alloc_failure = (
|
||||
OOMMemoryAllocFailureType.failed_no_free_chunks
|
||||
)
|
||||
return
|
||||
|
||||
self.oom_result.mem_alloc_failure = (
|
||||
OOMMemoryAllocFailureType.failed_unknown_reason
|
||||
)
|
||||
|
||||
def _calc_pstable_values(self):
|
||||
"""Set additional notes to processes listed in the process table"""
|
||||
tpid = self.oom_result.details["trigger_proc_pid"]
|
||||
@ -3323,6 +3505,7 @@ class OOMAnalyser:
|
||||
self._calc_system_values()
|
||||
self._calc_trigger_process_values()
|
||||
self._calc_killed_process_values()
|
||||
self._analyse_alloc_failure()
|
||||
|
||||
def analyse(self):
|
||||
"""
|
||||
@ -4189,6 +4372,7 @@ Out of memory: Killed process 651 (unattended-upgr) total-vm:108020kB, anon-rss:
|
||||
self._show_items()
|
||||
self._show_swap_usage()
|
||||
self._show_ram_usage()
|
||||
self._show_alloc_failure()
|
||||
|
||||
# generate process table
|
||||
self._show_pstable()
|
||||
@ -4198,6 +4382,29 @@ Out of memory: Killed process 651 (unattended-upgr) total-vm:108020kB, anon-rss:
|
||||
element.textContent = self.oom_result.oom_text
|
||||
self.toggle_oom(show=False)
|
||||
|
||||
def _show_alloc_failure(self):
|
||||
"""Show details why the memory allocation failed"""
|
||||
if (
|
||||
self.oom_result.mem_alloc_failure
|
||||
== OOMMemoryAllocFailureType.failed_below_low_watermark
|
||||
):
|
||||
show_elements(".js-alloc-failure--show")
|
||||
show_elements(".js-alloc-failure-below-low-watermark--show")
|
||||
elif (
|
||||
self.oom_result.mem_alloc_failure
|
||||
== OOMMemoryAllocFailureType.failed_no_free_chunks
|
||||
):
|
||||
show_elements(".js-alloc-failure--show")
|
||||
show_elements(".js-alloc-failure-no-free-chunks--show")
|
||||
elif (
|
||||
self.oom_result.mem_alloc_failure
|
||||
== OOMMemoryAllocFailureType.failed_unknown_reason
|
||||
):
|
||||
show_elements(".js-alloc-failure--show")
|
||||
show_elements(".js-alloc-failure-unknown-reason-show")
|
||||
else:
|
||||
hide_elements(".js-alloc-failure--show")
|
||||
|
||||
def _show_ram_usage(self):
|
||||
"""Generate RAM usage diagram"""
|
||||
ram_title_attr = (
|
||||
|
82
test.py
82
test.py
@ -245,6 +245,11 @@ class TestInBrowser(TestBase):
|
||||
"99% (8343236 kBytes out of 8388604 kBytes) swap space" in explanation.text,
|
||||
"Used swap space in summary not found",
|
||||
)
|
||||
self.assertTrue(
|
||||
"The request failed because after its fulfillment the free memory would be below the memory low watermark."
|
||||
in explanation.text,
|
||||
"Memory allocation failure analysis not found",
|
||||
)
|
||||
|
||||
mem_node_info = self.driver.find_element(By.CLASS_NAME, "mem_node_info")
|
||||
self.assertEqual(
|
||||
@ -315,6 +320,10 @@ class TestInBrowser(TestBase):
|
||||
in explanation.text,
|
||||
"Used physical memory in summary not found",
|
||||
)
|
||||
self.assertTrue(
|
||||
"The request failed because" not in explanation.text,
|
||||
"Memory allocation failure analysis found",
|
||||
)
|
||||
|
||||
mem_node_info = self.driver.find_element(By.CLASS_NAME, "mem_node_info")
|
||||
self.assertEqual(
|
||||
@ -937,6 +946,10 @@ Hardware name: HP ProLiant DL385 G7, BIOS A18 12/08/2012
|
||||
'Wrong watermark level for node %s in zone "%s" (got: %d, expect %d)'
|
||||
% (node, zone, level, except_level),
|
||||
)
|
||||
node = analyser._extract_node_from_watermarks("Normal")
|
||||
self.assertTrue(
|
||||
node == 0, "Wrong node with memory shortage (got: %s, expect: 0)" % node
|
||||
)
|
||||
self.assertEqual(
|
||||
analyser.oom_result.kconfig.MAX_ORDER,
|
||||
11, # This is a hard coded value as extracted from kernel 6.2.0
|
||||
@ -944,6 +957,75 @@ Hardware name: HP ProLiant DL385 G7, BIOS A18 12/08/2012
|
||||
% analyser.oom_result.kconfig.MAX_ORDER,
|
||||
)
|
||||
|
||||
def test_011_alloc_failure(self):
|
||||
"""Test analysis why the memory allocation could be failed"""
|
||||
oom = OOMAnalyser.OOMEntity(OOMAnalyser.OOMDisplay.example_rhel7)
|
||||
analyser = OOMAnalyser.OOMAnalyser(oom)
|
||||
success = analyser.analyse()
|
||||
self.assertTrue(success, "OOM analysis failed")
|
||||
|
||||
self.assertEqual(
|
||||
analyser.oom_result.oom_type,
|
||||
OOMAnalyser.OOMEntityType.automatic,
|
||||
"OOM triggered manually",
|
||||
)
|
||||
self.assertTrue(
|
||||
"_buddyinfo" in analyser.oom_result.details, "Missing buddyinfo"
|
||||
)
|
||||
self.assertTrue(analyser.oom_result.details["_buddyinfo"], "Empty buddyinfo")
|
||||
self.assertTrue(
|
||||
"trigger_proc_order" in analyser.oom_result.details
|
||||
and "trigger_proc_mem_zone" in analyser.oom_result.details,
|
||||
"Missing trigger_proc_order and/or trigger_proc_mem_zone",
|
||||
)
|
||||
self.assertTrue(
|
||||
"_watermarks" in analyser.oom_result.details,
|
||||
"Missing watermark information - skip memory analysis",
|
||||
)
|
||||
|
||||
for zone, order, node, expected_result in [
|
||||
("DMA", 0, 0, True),
|
||||
("DMA", 6, 0, True),
|
||||
("DMA32", 0, 0, True),
|
||||
("DMA32", 10, 0, False),
|
||||
("Normal", 0, 0, True),
|
||||
("Normal", 0, 1, True),
|
||||
("Normal", 6, 0, False),
|
||||
("Normal", 6, 1, True),
|
||||
("Normal", 7, 0, False),
|
||||
("Normal", 7, 1, True),
|
||||
("Normal", 9, 0, False),
|
||||
("Normal", 9, 1, False),
|
||||
]:
|
||||
result = analyser._check_free_chunks(order, zone, node)
|
||||
self.assertEqual(
|
||||
result,
|
||||
expected_result,
|
||||
"Wrong result of the check for free chunks with the same or higher order for Node %d, "
|
||||
'Zone "%s" and order %d (got: %s, expected %s)'
|
||||
% (node, zone, order, result, expected_result),
|
||||
)
|
||||
|
||||
# Search node with memory shortage: watermark "free" < "min"
|
||||
for zone, expected_node in [
|
||||
("DMA", None),
|
||||
("DMA32", None),
|
||||
("Normal", 0),
|
||||
]:
|
||||
node = analyser._extract_node_from_watermarks(zone)
|
||||
self.assertEqual(
|
||||
node,
|
||||
expected_node,
|
||||
'Wrong result if a node has memory shortage in zone "%s" (got: %s, expected %s)'
|
||||
% (zone, node, expected_node),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
analyser.oom_result.mem_alloc_failure,
|
||||
OOMAnalyser.OOMMemoryAllocFailureType.failed_below_low_watermark,
|
||||
"Unexpected reason why the memory allocation has failed.",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
|
Loading…
Reference in New Issue
Block a user