Add analysis why the memory request failed

2023-03-02 20:43:35 +01:00 · 2023-03-02 20:43:35 +01:00 · ec8b4c2374
commit ec8b4c2374
parent beb00ce17e
3 changed files with 354 additions and 28 deletions
--- a/OOMAnalyser.html
+++ b/OOMAnalyser.html
@ -68,6 +68,22 @@ THIS PROGRAM COMES WITH NO WARRANTY
            display: block;
        }

+        .js-alloc-failure--show {
+            /* empty - used to hide/show detailed allocation failure analysis */
+        }
+
+        .js-alloc-failure-below-low-watermark--show {
+            /* empty - used to hide/show details for failed memory allocations */
+        }
+
+        .js-alloc-failure-no-free-chunks--show {
+            /* empty - used to hide/show details for failed memory allocations */
+        }
+
+        .js-alloc-failure-unknown-reason-show {
+            /* empty - used to hide/show details for failed memory allocations */
+        }
+
        .js-killed-proc-score--show {
            /* empty - used to hide/show OOM score of killed process */
        }
@ -378,16 +394,6 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
                The OOM killer calculates a score for each process and terminates the process with the highest score.
            </p>
        </div>
-        <p>
-            The process &quot;<span class="killed_proc_name"></span>&quot;
-            (PID <span class="killed_proc_pid"></span>)
-            <span class="js-text--default-show js-killed-proc-score--show">
-                with an OOM score of <span class="killed_proc_score"></span>
-            </span>
-            has been terminated. It uses <span class="killed_proc_rss_percent"></span>
-            (<span class="killed_proc_total_rss_kb"></span>) of the resident memory.
-        </p>
-
        <div class="js-text--default-show js-swap-active--show">
            <p>
                The system has <span class="system_total_ram_kb"></span> physical memory and
@ -406,6 +412,36 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
                physical memory are in use.
            </p>
        </div>
+        <p>
+            The process &quot;<span class="killed_proc_name"></span>&quot;
+            (PID <span class="killed_proc_pid"></span>)
+            <span class="js-text--default-show js-killed-proc-score--show">
+                with an OOM score of <span class="killed_proc_score"></span>
+            </span>
+            has been terminated. It uses <span class="killed_proc_rss_percent"></span>
+            (<span class="killed_proc_total_rss_kb"></span>) of the resident memory.
+        </p>
+        <div class="js-text--default-hide js-alloc-failure--show">
+            <p>
+                A memory chunk of order <span class="trigger_proc_order"></span> (
+                <span class="trigger_proc_requested_memory_pages"></span> /
+                <span class="trigger_proc_requested_memory_pages_kb"></span> ) has been requested from the
+                &quot;<span class="trigger_proc_mem_zone"></span>&quot; memory zone.
+
+                <span class="js-text--default-hide js-alloc-failure-below-low-watermark--show">
+                    The request failed because after its fulfillment the free memory would be below the memory
+                    low watermark.
+                </span>
+                <span class="js-text--default-hide js-alloc-failure-no-free-chunks--show">
+                    If this requirement were met, the free memory would still be above the low memory watermark.
+                    The request failed because there is no free chunk in the current or higher order.
+                </span>
+                <span class="js-text--default-hide js-alloc-failure-unknown-reason-show">
+                    The request failed, but the reason is unknown.
+                </span>
+                This analysis result is an estimate because the kernel reduces the minimum watermark in some rare cases.
+            </p>
+        </div>
    </div>

    <h3>Details of analysis</h3>
@ -955,6 +991,7 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
        <li>Fix and rework calculation of GFP flags</li>
        <li>Add GFP flags for more kernel releases</li>
        <li>Display missing memory chunks (buddyinfo) again</li>
+        <li>Add analysis why the memory request failed</li>
        <li>...</li>
    </ol>

--- a/OOMAnalyser.py
+++ b/OOMAnalyser.py
@ -132,6 +132,28 @@ class OOMEntityType:
    manual = 2


+class OOMMemoryAllocFailureType:
+    """
+    Enum to store the results why the memory allocation could have failed
+
+    The members are plain integer class attributes. The current state of an
+    analysis is kept in OOMResult.mem_alloc_failure.
+    """
+
+    not_started = 0
+    """Analysis not started"""
+
+    missing_data = 1
+    """Missing data to start analysis"""
+
+    failed_below_low_watermark = 2
+    """Failed, because after satisfying this request, the free memory will be below the low memory watermark"""
+
+    failed_no_free_chunks = 3
+    """Failed, because no suitable chunk is free in the current or any higher order."""
+
+    failed_unknown_reason = 4
+    """Failed, but the reason is unknown"""
+
+    skipped_high_order_dont_trigger_oom = 5
+    """"high order" requests don't trigger OOM"""
+
+
 def is_visible(element):
    return element.offsetWidth > 0 and element.offsetHeight > 0

@ -484,6 +506,11 @@ class BaseKernelConfig:
    ]
    """Elements of the process table"""

+    PAGE_ALLOC_COSTLY_ORDER = 3
+    """
+    Requests with order > PAGE_ALLOC_COSTLY_ORDER will never trigger the OOM-killer to satisfy the request.
+    """
+
    pstable_html = [
        "PID",
        "UID",
@ -549,6 +576,13 @@ class BaseKernelConfig:
    :type: str
    """

+    ZONE_TYPES = ["DMA", "DMA32", "Normal", "HighMem", "Movable"]
+    """
+    List of memory zones
+
+    @type: List(str)
+    """
+
    def __init__(self):
        super().__init__()

@ -2670,26 +2704,9 @@ class OOMEntity:
 class OOMResult:
    """Results of an OOM analysis"""

-    kconfig = BaseKernelConfig()
-    """Kernel configuration"""
-
    details = {}
    """Extracted result"""

-    oom_entity = None
-    """
-    State of this OOM (unknown, incomplete, ...)
-
-    :type: OOMEntityState
-    """
-
-    oom_type = OOMEntityType.unknown
-    """
-    Type of this OOM (manually or automatically triggered)
-
-    :type: OOMEntityType
-    """
-
    error_msg = ""
    """
    Error message
@ -2697,6 +2714,9 @@ class OOMResult:
    @type: str
    """

+    kconfig = BaseKernelConfig()
+    """Kernel configuration"""
+
    kversion = None
    """
    Kernel version
@ -2704,6 +2724,19 @@ class OOMResult:
    @type: str
    """

+    mem_alloc_failure = OOMMemoryAllocFailureType.not_started
+    """State/result of the memory allocation failure analysis
+
+    @see: OOMAnalyser._analyse_alloc_failure()
+    """
+
+    oom_entity = None
+    """
+    State of this OOM (unknown, incomplete, ...)
+
+    :type: OOMEntityState
+    """
+
    oom_text = None
    """
    OOM text
@ -2711,6 +2744,13 @@ class OOMResult:
    @type: str
    """

+    oom_type = OOMEntityType.unknown
+    """
+    Type of this OOM (manually or automatically triggered)
+
+    :type: OOMEntityType
+    """
+
    swap_active = False
    """
    Swap space active or inactive
@ -3063,7 +3103,8 @@ class OOMAnalyser:
        Extract memory watermark information from all zones

        This function fills:
-        * OOMResult.details["_watermarks"] with [<zone>][<node>][(free|min|low|high)] = <XXX>
+        * OOMResult.details["_watermarks"] with [<zone>][<node>][(free|min|low|high)] = int
+        * OOMResult.details["_watermarks"] with [<zone>][<node>][(lowmem_reserve)] = List(int)
        """
        self.oom_result.details["_watermarks"] = {}
        watermark_info = self.oom_result.details["_watermarks"]
@ -3075,9 +3116,16 @@ class OOMAnalyser:
        # Therefore, we reset the counter by one line.
        self.oom_entity.back()

+        node = None
+        zone = None
        for line in self.oom_entity:
            match = self.REC_WATERMARK.match(line)
            if not match:
+                if line.startswith("lowmem_reserve[]:"):
+                    # zone and node are defined in the previous round
+                    watermark_info[zone][node]["lowmem_reserve"] = [
+                        int(v) for v in line.split()[1:]
+                    ]
                continue

            node = int(match.group("node"))
@ -3089,6 +3137,32 @@ class OOMAnalyser:
            for i in ["free", "min", "low", "high"]:
                watermark_info[zone][node][i] = int(match.group(i))

+    def _extract_node_from_watermarks(self, zone):
+        """
+        Search node with memory shortage: watermark "free" < "min"
+
+        The watermarks are read from self.oom_result.details["_watermarks"].
+        If the zone is unknown or no node is short of memory, a debug message
+        is emitted and None is returned.
+
+        @param str zone: Requested zone
+        @return: First node with memory shortage or None if no node found
+        @rtype: None|int
+        """
+        watermark_info = self.oom_result.details["_watermarks"]
+        if zone not in watermark_info:
+            debug(
+                "Missing watermark info for zone {} - skip memory analysis".format(zone)
+            )
+            return None
+        # NOTE(review): the pragma comments around the loop are presumably Transcrypt
+        # directives that switch to JavaScript-style key iteration - keep them unchanged.
+        # __pragma__ ('jsiter')
+        for node in watermark_info[zone]:
+            if watermark_info[zone][node]["free"] < watermark_info[zone][node]["min"]:
+                # the first matching node wins; the key is converted to int for the caller
+                return int(node)
+        # __pragma__ ('nojsiter')
+
+        debug(
+            "Node with current memory shortage cannot be determined - skip memory analysis"
+        )
+
+        return None
+
    def _gfp_hex2flags(self, hexvalue):
        """\
        Convert the hexadecimal value into flags specified by definition
@ -3170,6 +3244,114 @@ class OOMAnalyser:
        ps_index.sort(key=int)
        self.oom_result.details["_pstable_index"] = ps_index

+    def _check_free_chunks(self, start_with_order, zone, node):
+        """Check for at least one free chunk in the current or any higher order.
+
+        Returns True, if at least one suitable chunk is free.
+        Returns False, if no free chunk was found. The search stops at the first
+        order that is missing in buddyinfo or at MAX_ORDER (exclusive).
+        Returns None, if buddyinfo is empty or doesn't contain information for the
+        requested zone or - for a checked order - for the requested node.
+
+        @param int start_with_order: Start checking with this order
+        @param str zone: Memory zone
+        @param int node: Node number
+        @rtype: None|bool
+        """
+        if not self.oom_result.details["_buddyinfo"]:
+            return None
+        buddyinfo = self.oom_result.details["_buddyinfo"]
+        if zone not in buddyinfo:
+            return None
+
+        for order in range(start_with_order, self.oom_result.kconfig.MAX_ORDER):
+            if order not in buddyinfo[zone]:
+                # no data for this order - stop searching and report "no free chunk"
+                break
+            if node not in buddyinfo[zone][order]:
+                return None
+            free_chunks = buddyinfo[zone][order][node]
+            if free_chunks:
+                return True
+        return False
+
+    def _analyse_alloc_failure(self):
+        """
+        Analyse why the memory allocation could be failed.
+
+        The code in this function is inspired by mm/page_alloc.c:__zone_watermark_ok()
+        """
+        self.oom_result.mem_alloc_failure = OOMMemoryAllocFailureType.not_started
+
+        if self.oom_result.oom_type == OOMEntityType.manual:
+            debug("OOM triggered manually - skip memory analysis")
+            return
+        if "_buddyinfo" not in self.oom_result.details:
+            debug("Missing buddyinfo - skip memory analysis")
+            return
+        if not self.oom_result.details["_buddyinfo"]:
+            debug("Empty buddyinfo - skip memory analysis")
+            return
+        if ("trigger_proc_order" not in self.oom_result.details) or (
+            "trigger_proc_mem_zone" not in self.oom_result.details
+        ):
+            debug(
+                "Missing trigger_proc_order and/or trigger_proc_mem_zone - skip memory analysis"
+            )
+            return
+        if "_watermarks" not in self.oom_result.details:
+            debug("Missing watermark information - skip memory analysis")
+            return
+
+        order = self.oom_result.details["trigger_proc_order"]
+        zone = self.oom_result.details["trigger_proc_mem_zone"]
+        watermark_info = self.oom_result.details["_watermarks"]
+
+        # "high order" requests don't trigger OOM
+        if int(order) > self.oom_result.kconfig.PAGE_ALLOC_COSTLY_ORDER:
+            debug("high order requests should not trigger OOM - skip memory analysis")
+            self.oom_result.mem_alloc_failure = (
+                OOMMemoryAllocFailureType.skipped_high_order_dont_trigger_oom
+            )
+            return
+
+        # Search node with memory shortage: watermark "free" < "min"
+        node = self._extract_node_from_watermarks(zone)
+        if node is None:
+            return  # error cause already shown as debug message
+
+        # the remaining code is similar to mm/page_alloc.c:__zone_watermark_ok()
+        # =======================================================================
+
+        # calculation in kB and not in pages
+        free_kb = watermark_info[zone][node]["free"]
+        highest_zoneidx = self.oom_result.kconfig.ZONE_TYPES.index(zone)
+        lowmem_reserve = watermark_info[zone][node]["lowmem_reserve"]
+        min_kb = watermark_info[zone][node]["low"]
+        page_size = self.oom_result.details["_buddyinfo_pagesize_kb"]
+
+        # reduce minimum watermark for high priority calls
+        # ALLOC_HIGH == __GFP_HIGH
+        gfp_mask_decimal = self.oom_result.details["_trigger_proc_gfp_mask_decimal"]
+        gfp_flag_high = self.oom_result.kconfig.GFP_FLAGS["__GFP_DMA"]["_value"]
+        if (gfp_mask_decimal & gfp_flag_high) == gfp_flag_high:
+            min_kb -= int(min_kb / 2)
+
+        # check watermarks, if these are not met, then a high-order request also
+        # cannot go ahead even if a suitable page happened to be free.
+        if free_kb <= (min_kb + (lowmem_reserve[highest_zoneidx] * page_size)):
+            self.oom_result.mem_alloc_failure = (
+                OOMMemoryAllocFailureType.failed_below_low_watermark
+            )
+            return
+
+        # For a high-order request, check at least one suitable page is free
+        if not self._check_free_chunks(order, zone, node):
+            self.oom_result.mem_alloc_failure = (
+                OOMMemoryAllocFailureType.failed_no_free_chunks
+            )
+            return
+
+        self.oom_result.mem_alloc_failure = (
+            OOMMemoryAllocFailureType.failed_unknown_reason
+        )
+
    def _calc_pstable_values(self):
        """Set additional notes to processes listed in the process table"""
        tpid = self.oom_result.details["trigger_proc_pid"]
@ -3323,6 +3505,7 @@ class OOMAnalyser:
        self._calc_system_values()
        self._calc_trigger_process_values()
        self._calc_killed_process_values()
+        self._analyse_alloc_failure()

    def analyse(self):
        """
@ -4189,6 +4372,7 @@ Out of memory: Killed process 651 (unattended-upgr) total-vm:108020kB, anon-rss:
        self._show_items()
        self._show_swap_usage()
        self._show_ram_usage()
+        self._show_alloc_failure()

        # generate process table
        self._show_pstable()
@ -4198,6 +4382,29 @@ Out of memory: Killed process 651 (unattended-upgr) total-vm:108020kB, anon-rss:
        element.textContent = self.oom_result.oom_text
        self.toggle_oom(show=False)

+    def _show_alloc_failure(self):
+        """
+        Show details why the memory allocation failed
+
+        Depending on self.oom_result.mem_alloc_failure the general section and the
+        matching detail text are shown. For every other state (e.g. analysis
+        not started or skipped) the whole section is hidden.
+        """
+        if (
+            self.oom_result.mem_alloc_failure
+            == OOMMemoryAllocFailureType.failed_below_low_watermark
+        ):
+            show_elements(".js-alloc-failure--show")
+            show_elements(".js-alloc-failure-below-low-watermark--show")
+        elif (
+            self.oom_result.mem_alloc_failure
+            == OOMMemoryAllocFailureType.failed_no_free_chunks
+        ):
+            show_elements(".js-alloc-failure--show")
+            show_elements(".js-alloc-failure-no-free-chunks--show")
+        elif (
+            self.oom_result.mem_alloc_failure
+            == OOMMemoryAllocFailureType.failed_unknown_reason
+        ):
+            show_elements(".js-alloc-failure--show")
+            # NOTE: this selector ends with a single dash ("-show"); it has to
+            # match the class name used in OOMAnalyser.html
+            show_elements(".js-alloc-failure-unknown-reason-show")
+        else:
+            hide_elements(".js-alloc-failure--show")
+
    def _show_ram_usage(self):
        """Generate RAM usage diagram"""
        ram_title_attr = (
--- a/test.py
+++ b/test.py
@ -245,6 +245,11 @@ class TestInBrowser(TestBase):
            "99% (8343236 kBytes out of 8388604 kBytes) swap space" in explanation.text,
            "Used swap space in summary not found",
        )
+        self.assertTrue(
+            "The request failed because after its fulfillment the free memory would be below the memory low watermark."
+            in explanation.text,
+            "Memory allocation failure analysis not found",
+        )

        mem_node_info = self.driver.find_element(By.CLASS_NAME, "mem_node_info")
        self.assertEqual(
@ -315,6 +320,10 @@ class TestInBrowser(TestBase):
            in explanation.text,
            "Used physical memory in summary not found",
        )
+        self.assertTrue(
+            "The request failed because" not in explanation.text,
+            "Memory allocation failure analysis found",
+        )

        mem_node_info = self.driver.find_element(By.CLASS_NAME, "mem_node_info")
        self.assertEqual(
@ -937,6 +946,10 @@ Hardware name: HP ProLiant DL385 G7, BIOS A18 12/08/2012
                'Wrong watermark level for node %s in zone "%s" (got: %d, expect %d)'
                % (node, zone, level, except_level),
            )
+        node = analyser._extract_node_from_watermarks("Normal")
+        self.assertTrue(
+            node == 0, "Wrong node with memory shortage (got: %s, expect: 0)" % node
+        )
        self.assertEqual(
            analyser.oom_result.kconfig.MAX_ORDER,
            11,  # This is a hard coded value as extracted from kernel 6.2.0
@ -944,6 +957,75 @@ Hardware name: HP ProLiant DL385 G7, BIOS A18 12/08/2012
            % analyser.oom_result.kconfig.MAX_ORDER,
        )

+    def test_011_alloc_failure(self):
+        """Test analysis why the memory allocation could have failed"""
+        oom = OOMAnalyser.OOMEntity(OOMAnalyser.OOMDisplay.example_rhel7)
+        analyser = OOMAnalyser.OOMAnalyser(oom)
+        success = analyser.analyse()
+        self.assertTrue(success, "OOM analysis failed")
+
+        # The following checks mirror the preconditions of _analyse_alloc_failure()
+        self.assertEqual(
+            analyser.oom_result.oom_type,
+            OOMAnalyser.OOMEntityType.automatic,
+            "OOM triggered manually",
+        )
+        self.assertTrue(
+            "_buddyinfo" in analyser.oom_result.details, "Missing buddyinfo"
+        )
+        self.assertTrue(analyser.oom_result.details["_buddyinfo"], "Empty buddyinfo")
+        self.assertTrue(
+            "trigger_proc_order" in analyser.oom_result.details
+            and "trigger_proc_mem_zone" in analyser.oom_result.details,
+            "Missing trigger_proc_order and/or trigger_proc_mem_zone",
+        )
+        self.assertTrue(
+            "_watermarks" in analyser.oom_result.details,
+            "Missing watermark information - skip memory analysis",
+        )
+
+        # Each tuple: (zone, start order, node, expected result of _check_free_chunks())
+        for zone, order, node, expected_result in [
+            ("DMA", 0, 0, True),
+            ("DMA", 6, 0, True),
+            ("DMA32", 0, 0, True),
+            ("DMA32", 10, 0, False),
+            ("Normal", 0, 0, True),
+            ("Normal", 0, 1, True),
+            ("Normal", 6, 0, False),
+            ("Normal", 6, 1, True),
+            ("Normal", 7, 0, False),
+            ("Normal", 7, 1, True),
+            ("Normal", 9, 0, False),
+            ("Normal", 9, 1, False),
+        ]:
+            result = analyser._check_free_chunks(order, zone, node)
+            self.assertEqual(
+                result,
+                expected_result,
+                "Wrong result of the check for free chunks with the same or higher order for Node %d, "
+                'Zone "%s" and order %d (got: %s, expected %s)'
+                % (node, zone, order, result, expected_result),
+            )
+
+        # Search node with memory shortage: watermark "free" < "min"
+        for zone, expected_node in [
+            ("DMA", None),
+            ("DMA32", None),
+            ("Normal", 0),
+        ]:
+            node = analyser._extract_node_from_watermarks(zone)
+            self.assertEqual(
+                node,
+                expected_node,
+                'Wrong result if a node has memory shortage in zone "%s" (got: %s, expected %s)'
+                % (zone, node, expected_node),
+            )
+
+        # Overall result of the analysis that was run as part of analyse()
+        self.assertEqual(
+            analyser.oom_result.mem_alloc_failure,
+            OOMAnalyser.OOMMemoryAllocFailureType.failed_below_low_watermark,
+            "Unexpected reason why the memory allocation has failed.",
+        )
+

 if __name__ == "__main__":
    unittest.main(verbosity=2)