From ec8b4c23748ba1dc87617f2b149dc40c2c4e6936 Mon Sep 17 00:00:00 2001 From: Carsten Grohmann Date: Thu, 2 Mar 2023 20:43:35 +0100 Subject: [PATCH] Add analysis why the memory request failed --- OOMAnalyser.html | 57 +++++++++-- OOMAnalyser.py | 243 +++++++++++++++++++++++++++++++++++++++++++---- test.py | 82 ++++++++++++++++ 3 files changed, 354 insertions(+), 28 deletions(-) diff --git a/OOMAnalyser.html b/OOMAnalyser.html index d3c55f0..db48703 100644 --- a/OOMAnalyser.html +++ b/OOMAnalyser.html @@ -68,6 +68,22 @@ THIS PROGRAM COMES WITH NO WARRANTY display: block; } + .js-alloc-failure--show { + /* empty - used to hide/show detailed allocation failure analysis */ + } + + .js-alloc-failure-below-low-watermark--show { + /* empty - used to hide/show details for failed memory allocations */ + } + + .js-alloc-failure-no-free-chunks--show { + /* empty - used to hide/show details for failed memory allocations */ + } + + .js-alloc-failure-unknown-reason-show { + /* empty - used to hide/show details for failed memory allocations */ + } + .js-killed-proc-score--show { /* empty - used to hide/show OOM score of killed process */ } @@ -378,16 +394,6 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) { The OOM killer calculates a score for each process and terminates the process with the highest score.

-

- The process "" - (PID ) - - with an OOM score of - - has been terminated. It uses - () of the resident memory. -

-

The system has physical memory and @@ -406,6 +412,36 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) { physical memory are in use.

+

+ The process "" + (PID ) + + with an OOM score of + + has been terminated. It uses + () of the resident memory. +

+
+

+ A memory chunk of order ( + / + ) has been requested from the + "" memory zone. + + + The request failed because after its fulfillment the free memory would be below the memory + low watermark. + + + If this requirement were met, the free memory would still be above the low memory watermark. + The request failed because there is no free chunk in the current or higher order. + + + The request failed, but the reason is unknown. + + This analysis result is an estimate because the kernel reduces the minimum watermark in some rare cases. +

+

Details of analysis

@@ -955,6 +991,7 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
  • Fix and rework calculation of GFP flags
  • Add GFP flags for more kernel releases
  • Display missing memory chunks (buddyinfo) again
  • +
  • Add analysis why the memory request failed
  • ...
  • diff --git a/OOMAnalyser.py b/OOMAnalyser.py index 7ad6072..aee3b38 100644 --- a/OOMAnalyser.py +++ b/OOMAnalyser.py @@ -132,6 +132,28 @@ class OOMEntityType: manual = 2 +class OOMMemoryAllocFailureType: + """Enum to store the results why the memory allocation could have failed""" + + not_started = 0 + """Analysis not started""" + + missing_data = 1 + """Missing data to start analysis""" + + failed_below_low_watermark = 2 + """Failed, because after satisfying this request, the free memory will be below the low memory watermark""" + + failed_no_free_chunks = 3 + """Failed, because no suitable chunk is free in the current or any higher order.""" + + failed_unknown_reason = 4 + """Failed, but the reason is unknown""" + + skipped_high_order_dont_trigger_oom = 5 + """"high order" requests don't trigger OOM""" + + def is_visible(element): return element.offsetWidth > 0 and element.offsetHeight > 0 @@ -484,6 +506,11 @@ class BaseKernelConfig: ] """Elements of the process table""" + PAGE_ALLOC_COSTLY_ORDER = 3 + """ + Requests with order > PAGE_ALLOC_COSTLY_ORDER will never trigger the OOM-killer to satisfy the request. + """ + pstable_html = [ "PID", "UID", @@ -549,6 +576,13 @@ class BaseKernelConfig: :type: str """ + ZONE_TYPES = ["DMA", "DMA32", "Normal", "HighMem", "Movable"] + """ + List of memory zones + + @type: List(str) + """ + def __init__(self): super().__init__() @@ -2670,26 +2704,9 @@ class OOMEntity: class OOMResult: """Results of an OOM analysis""" - kconfig = BaseKernelConfig() - """Kernel configuration""" - details = {} """Extracted result""" - oom_entity = None - """ - State of this OOM (unknown, incomplete, ...) - - :type: OOMEntityState - """ - - oom_type = OOMEntityType.unknown - """ - Type of this OOM (manually or automatically triggered) - - :type: OOMEntityType - """ - error_msg = "" """ Error message @@ -2697,6 +2714,9 @@ class OOMResult: @type: str """ + kconfig = BaseKernelConfig() + """Kernel configuration""" + kversion = None """ Kernel version @@ -2704,6 +2724,19 @@ class OOMResult: @type: str """ + mem_alloc_failure = OOMMemoryAllocFailureType.not_started + """State/result of the memory allocation failure analysis + + @see: OOMAnalyser._analyse_alloc_failure() + """ + + oom_entity = None + """ + State of this OOM (unknown, incomplete, ...) + + :type: OOMEntityState + """ + oom_text = None """ OOM text @@ -2711,6 +2744,13 @@ class OOMResult: @type: str """ + oom_type = OOMEntityType.unknown + """ + Type of this OOM (manually or automatically triggered) + + :type: OOMEntityType + """ + swap_active = False """ Swap space active or inactive @@ -3063,7 +3103,8 @@ class OOMAnalyser: Extract memory watermark information from all zones This function fills: - * OOMResult.details["_watermarks"] with [][][(free|min|low|high)] = + * OOMResult.details["_watermarks"] with [][][(free|min|low|high)] = int + * OOMResult.details["_watermarks"] with [][][(lowmem_reserve)] = List(int) """ self.oom_result.details["_watermarks"] = {} watermark_info = self.oom_result.details["_watermarks"] @@ -3075,9 +3116,16 @@ class OOMAnalyser: # Therefore, we reset the counter by one line. self.oom_entity.back() + node = None + zone = None for line in self.oom_entity: match = self.REC_WATERMARK.match(line) if not match: + if line.startswith("lowmem_reserve[]:"): + # zone and node are defined in the previous round + watermark_info[zone][node]["lowmem_reserve"] = [ + int(v) for v in line.split()[1:] + ] continue node = int(match.group("node")) @@ -3089,6 +3137,32 @@ class OOMAnalyser: for i in ["free", "min", "low", "high"]: watermark_info[zone][node][i] = int(match.group(i)) + def _extract_node_from_watermarks(self, zone): + """ + Search node with memory shortage: watermark "free" < "min" + + @param str zone: Requested zone + @return: First node with memory shortage or None if no node found + @rtype: None|int + """ + watermark_info = self.oom_result.details["_watermarks"] + if zone not in watermark_info: + debug( + "Missing watermark info for zone {} - skip memory analysis".format(zone) + ) + return None + # __pragma__ ('jsiter') + for node in watermark_info[zone]: + if watermark_info[zone][node]["free"] < watermark_info[zone][node]["min"]: + return int(node) + # __pragma__ ('nojsiter') + + debug( + "Node with current memory shortage cannot be determined - skip memory analysis" + ) + + return None + def _gfp_hex2flags(self, hexvalue): """\ Convert the hexadecimal value into flags specified by definition @@ -3170,6 +3244,114 @@ class OOMAnalyser: ps_index.sort(key=int) self.oom_result.details["_pstable_index"] = ps_index + def _check_free_chunks(self, start_with_order, zone, node): + """Check for at least one free chunk in the current or any higher order. + + Returns True, if at lease one suitable chunk is free. + Returns None, if buddyinfo doesn't contain information for the requested node, order or zone + + @param int start_with_order: Start checking with this order + @param str zone: Memory zone + @param int node: Node number + @rtype: None|bool + """ + if not self.oom_result.details["_buddyinfo"]: + return None + buddyinfo = self.oom_result.details["_buddyinfo"] + if zone not in buddyinfo: + return None + + for order in range(start_with_order, self.oom_result.kconfig.MAX_ORDER): + if order not in buddyinfo[zone]: + break + if node not in buddyinfo[zone][order]: + return None + free_chunks = buddyinfo[zone][order][node] + if free_chunks: + return True + return False + + def _analyse_alloc_failure(self): + """ + Analyse why the memory allocation could be failed. + + The code in this function is inspired by mm/page_alloc.c:__zone_watermark_ok() + """ + self.oom_result.mem_alloc_failure = OOMMemoryAllocFailureType.not_started + + if self.oom_result.oom_type == OOMEntityType.manual: + debug("OOM triggered manually - skip memory analysis") + return + if "_buddyinfo" not in self.oom_result.details: + debug("Missing buddyinfo - skip memory analysis") + return + if not self.oom_result.details["_buddyinfo"]: + debug("Empty buddyinfo - skip memory analysis") + return + if ("trigger_proc_order" not in self.oom_result.details) or ( + "trigger_proc_mem_zone" not in self.oom_result.details + ): + debug( + "Missing trigger_proc_order and/or trigger_proc_mem_zone - skip memory analysis" + ) + return + if "_watermarks" not in self.oom_result.details: + debug("Missing watermark information - skip memory analysis") + return + + order = self.oom_result.details["trigger_proc_order"] + zone = self.oom_result.details["trigger_proc_mem_zone"] + watermark_info = self.oom_result.details["_watermarks"] + + # "high order" requests don't trigger OOM + if int(order) > self.oom_result.kconfig.PAGE_ALLOC_COSTLY_ORDER: + debug("high order requests should not trigger OOM - skip memory analysis") + self.oom_result.mem_alloc_failure = ( + OOMMemoryAllocFailureType.skipped_high_order_dont_trigger_oom + ) + return + + # Search node with memory shortage: watermark "free" < "min" + node = self._extract_node_from_watermarks(zone) + if node is None: + return # error cause already shown as debug message + + # the remaining code is similar to mm/page_alloc.c:__zone_watermark_ok() + # ======================================================================= + + # calculation in kB and not in pages + free_kb = watermark_info[zone][node]["free"] + highest_zoneidx = self.oom_result.kconfig.ZONE_TYPES.index(zone) + lowmem_reserve = watermark_info[zone][node]["lowmem_reserve"] + min_kb = watermark_info[zone][node]["low"] + page_size = self.oom_result.details["_buddyinfo_pagesize_kb"] + + # reduce minimum watermark for high priority calls + # ALLOC_HIGH == __GFP_HIGH + gfp_mask_decimal = self.oom_result.details["_trigger_proc_gfp_mask_decimal"] + gfp_flag_high = self.oom_result.kconfig.GFP_FLAGS["__GFP_DMA"]["_value"] + if (gfp_mask_decimal & gfp_flag_high) == gfp_flag_high: + min_kb -= int(min_kb / 2) + + # check watermarks, if these are not met, then a high-order request also + # cannot go ahead even if a suitable page happened to be free. + if free_kb <= (min_kb + (lowmem_reserve[highest_zoneidx] * page_size)): + self.oom_result.mem_alloc_failure = ( + OOMMemoryAllocFailureType.failed_below_low_watermark + ) + return + + # For a high-order request, check at least one suitable page is free + if not self._check_free_chunks(order, zone, node): + self.oom_result.mem_alloc_failure = ( + OOMMemoryAllocFailureType.failed_no_free_chunks + ) + return + + self.oom_result.mem_alloc_failure = ( + OOMMemoryAllocFailureType.failed_unknown_reason + ) + def _calc_pstable_values(self): """Set additional notes to processes listed in the process table""" tpid = self.oom_result.details["trigger_proc_pid"] @@ -3323,6 +3505,7 @@ class OOMAnalyser: self._calc_system_values() self._calc_trigger_process_values() self._calc_killed_process_values() + self._analyse_alloc_failure() def analyse(self): """ @@ -4189,6 +4372,7 @@ Out of memory: Killed process 651 (unattended-upgr) total-vm:108020kB, anon-rss: self._show_items() self._show_swap_usage() self._show_ram_usage() + self._show_alloc_failure() # generate process table self._show_pstable() @@ -4198,6 +4382,29 @@ Out of memory: Killed process 651 (unattended-upgr) total-vm:108020kB, anon-rss: element.textContent = self.oom_result.oom_text self.toggle_oom(show=False) + def _show_alloc_failure(self): + """Show details why the memory allocation failed""" + if ( + self.oom_result.mem_alloc_failure + == OOMMemoryAllocFailureType.failed_below_low_watermark + ): + show_elements(".js-alloc-failure--show") + show_elements(".js-alloc-failure-below-low-watermark--show") + elif ( + self.oom_result.mem_alloc_failure + == OOMMemoryAllocFailureType.failed_no_free_chunks + ): + show_elements(".js-alloc-failure--show") + show_elements(".js-alloc-failure-no-free-chunks--show") + elif ( + self.oom_result.mem_alloc_failure + == OOMMemoryAllocFailureType.failed_unknown_reason + ): + show_elements(".js-alloc-failure--show") + show_elements(".js-alloc-failure-unknown-reason-show") + else: + hide_elements(".js-alloc-failure--show") + def _show_ram_usage(self): """Generate RAM usage diagram""" ram_title_attr = ( diff --git a/test.py b/test.py index 3dbba88..d9fd564 100755 --- a/test.py +++ b/test.py @@ -245,6 +245,11 @@ class TestInBrowser(TestBase): "99% (8343236 kBytes out of 8388604 kBytes) swap space" in explanation.text, "Used swap space in summary not found", ) + self.assertTrue( + "The request failed because after its fulfillment the free memory would be below the memory low watermark." + in explanation.text, + "Memory allocation failure analysis not found", + ) mem_node_info = self.driver.find_element(By.CLASS_NAME, "mem_node_info") self.assertEqual( @@ -315,6 +320,10 @@ class TestInBrowser(TestBase): in explanation.text, "Used physical memory in summary not found", ) + self.assertTrue( + "The request failed because" not in explanation.text, + "Memory allocation failure analysis found", + ) mem_node_info = self.driver.find_element(By.CLASS_NAME, "mem_node_info") self.assertEqual( @@ -937,6 +946,10 @@ Hardware name: HP ProLiant DL385 G7, BIOS A18 12/08/2012 'Wrong watermark level for node %s in zone "%s" (got: %d, expect %d)' % (node, zone, level, except_level), ) + node = analyser._extract_node_from_watermarks("Normal") + self.assertTrue( + node == 0, "Wrong node with memory shortage (got: %s, expect: 0)" % node + ) self.assertEqual( analyser.oom_result.kconfig.MAX_ORDER, 11, # This is a hard coded value as extracted from kernel 6.2.0 @@ -944,6 +957,75 @@ Hardware name: HP ProLiant DL385 G7, BIOS A18 12/08/2012 % analyser.oom_result.kconfig.MAX_ORDER, ) + def test_011_alloc_failure(self): + """Test analysis why the memory allocation could be failed""" + oom = OOMAnalyser.OOMEntity(OOMAnalyser.OOMDisplay.example_rhel7) + analyser = OOMAnalyser.OOMAnalyser(oom) + success = analyser.analyse() + self.assertTrue(success, "OOM analysis failed") + + self.assertEqual( + analyser.oom_result.oom_type, + OOMAnalyser.OOMEntityType.automatic, + "OOM triggered manually", + ) + self.assertTrue( + "_buddyinfo" in analyser.oom_result.details, "Missing buddyinfo" + ) + self.assertTrue(analyser.oom_result.details["_buddyinfo"], "Empty buddyinfo") + self.assertTrue( + "trigger_proc_order" in analyser.oom_result.details + and "trigger_proc_mem_zone" in analyser.oom_result.details, + "Missing trigger_proc_order and/or trigger_proc_mem_zone", + ) + self.assertTrue( + "_watermarks" in analyser.oom_result.details, + "Missing watermark information - skip memory analysis", + ) + + for zone, order, node, expected_result in [ + ("DMA", 0, 0, True), + ("DMA", 6, 0, True), + ("DMA32", 0, 0, True), + ("DMA32", 10, 0, False), + ("Normal", 0, 0, True), + ("Normal", 0, 1, True), + ("Normal", 6, 0, False), + ("Normal", 6, 1, True), + ("Normal", 7, 0, False), + ("Normal", 7, 1, True), + ("Normal", 9, 0, False), + ("Normal", 9, 1, False), + ]: + result = analyser._check_free_chunks(order, zone, node) + self.assertEqual( + result, + expected_result, + "Wrong result of the check for free chunks with the same or higher order for Node %d, " + 'Zone "%s" and order %d (got: %s, expected %s)' + % (node, zone, order, result, expected_result), + ) + + # Search node with memory shortage: watermark "free" < "min" + for zone, expected_node in [ + ("DMA", None), + ("DMA32", None), + ("Normal", 0), + ]: + node = analyser._extract_node_from_watermarks(zone) + self.assertEqual( + node, + expected_node, + 'Wrong result if a node has memory shortage in zone "%s" (got: %s, expected %s)' + % (zone, node, expected_node), + ) + + self.assertEqual( + analyser.oom_result.mem_alloc_failure, + OOMAnalyser.OOMMemoryAllocFailureType.failed_below_low_watermark, + "Unexpected reason why the memory allocation has failed.", + ) + if __name__ == "__main__": unittest.main(verbosity=2)