Add analysis why the memory request failed

2023-03-02 20:43:35 +01:00 · 2023-03-02 20:43:35 +01:00 · ec8b4c2374
commit ec8b4c2374
parent beb00ce17e
3 changed files with 354 additions and 28 deletions
--- a/OOMAnalyser.html
+++ b/OOMAnalyser.html
@ -68,6 +68,22 @@ THIS PROGRAM COMES WITH NO WARRANTY
            display: block;
        }
        .js-alloc-failure--show {
            /* empty - used to hide/show detailed allocation failure analysis */
        }
        .js-alloc-failure-below-low-watermark--show {
            /* empty - used to hide/show details for failed memory allocations */
        }
        .js-alloc-failure-no-free-chunks--show {
            /* empty - used to hide/show details for failed memory allocations */
        }
        .js-alloc-failure-unknown-reason-show {
            /* empty - used to hide/show details for failed memory allocations */
        }
        .js-killed-proc-score--show {
            /* empty - used to hide/show OOM score of killed process */
        }
@ -378,16 +394,6 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
                The OOM killer calculates a score for each process and terminates the process with the highest score.
            </p>
        </div>
        <p>
            The process &quot;<span class="killed_proc_name"></span>&quot;
            (PID <span class="killed_proc_pid"></span>)
            <span class="js-text--default-show js-killed-proc-score--show">
                with an OOM score of <span class="killed_proc_score"></span>
            </span>
            has been terminated. It uses <span class="killed_proc_rss_percent"></span>
            (<span class="killed_proc_total_rss_kb"></span>) of the resident memory.
        </p>
        <div class="js-text--default-show js-swap-active--show">
            <p>
                The system has <span class="system_total_ram_kb"></span> physical memory and
@ -406,6 +412,36 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
                physical memory are in use.
            </p>
        </div>
        <p>
            The process &quot;<span class="killed_proc_name"></span>&quot;
            (PID <span class="killed_proc_pid"></span>)
            <span class="js-text--default-show js-killed-proc-score--show">
                with an OOM score of <span class="killed_proc_score"></span>
            </span>
            has been terminated. It uses <span class="killed_proc_rss_percent"></span>
            (<span class="killed_proc_total_rss_kb"></span>) of the resident memory.
        </p>
        <div class="js-text--default-hide js-alloc-failure--show">
            <p>
                A memory chunk of order <span class="trigger_proc_order"></span> (
                <span class="trigger_proc_requested_memory_pages"></span> /
                <span class="trigger_proc_requested_memory_pages_kb"></span> ) has been requested from the
                &quot;<span class="trigger_proc_mem_zone"></span>&quot; memory zone.
                <span class="js-text--default-hide js-alloc-failure-below-low-watermark--show">
                    The request failed because after its fulfillment the free memory would be below the memory
                    low watermark.
                </span>
                <span class="js-text--default-hide js-alloc-failure-no-free-chunks--show">
                    If this requirement were met, the free memory would still be above the low memory watermark.
                    The request failed because there is no free chunk in the current or higher order.
                </span>
                <span class="js-text--default-hide js-alloc-failure-unknown-reason-show">
                    The request failed, but the reason is unknown.
                </span>
                This analysis result is an estimate because the kernel reduces the minimum watermark in some rare cases.
            </p>
        </div>
    </div>
    <h3>Details of analysis</h3>
@ -955,6 +991,7 @@ window.onerror = function (msg, url, lineNo, columnNo, errorObj) {
        <li>Fix and rework calculation of GFP flags</li>
        <li>Add GFP flags for more kernel releases</li>
        <li>Display missing memory chunks (buddyinfo) again</li>
        <li>Add analysis why the memory request failed</li>
        <li>...</li>
    </ol>
--- a/OOMAnalyser.py
+++ b/OOMAnalyser.py
@ -132,6 +132,28 @@ class OOMEntityType:
    manual = 2
 class OOMMemoryAllocFailureType:
    """Enum to store the results why the memory allocation could have failed"""
    not_started = 0
    """Analysis not started"""
    missing_data = 1
    """Missing data to start analysis"""
    failed_below_low_watermark = 2
    """Failed, because after satisfying this request, the free memory will be below the low memory watermark"""
    failed_no_free_chunks = 3
    """Failed, because no suitable chunk is free in the current or any higher order."""
    failed_unknown_reason = 4
    """Failed, but the reason is unknown"""
    skipped_high_order_dont_trigger_oom = 5
    """"high order" requests don't trigger OOM"""
 def is_visible(element):
    return element.offsetWidth > 0 and element.offsetHeight > 0
@ -484,6 +506,11 @@ class BaseKernelConfig:
    ]
    """Elements of the process table"""
    PAGE_ALLOC_COSTLY_ORDER = 3
    """
    Requests with order > PAGE_ALLOC_COSTLY_ORDER will never trigger the OOM-killer to satisfy the request.
    """
    pstable_html = [
        "PID",
        "UID",
@ -549,6 +576,13 @@ class BaseKernelConfig:
    :type: str
    """
    ZONE_TYPES = ["DMA", "DMA32", "Normal", "HighMem", "Movable"]
    """
    List of memory zones
    @type: List(str)
    """
    def __init__(self):
        super().__init__()
@ -2670,26 +2704,9 @@ class OOMEntity:
 class OOMResult:
    """Results of an OOM analysis"""
    kconfig = BaseKernelConfig()
    """Kernel configuration"""
    details = {}
    """Extracted result"""
    oom_entity = None
    """
    State of this OOM (unknown, incomplete, ...)
    :type: OOMEntityState
    """
    oom_type = OOMEntityType.unknown
    """
    Type of this OOM (manually or automatically triggered)
    :type: OOMEntityType
    """
    error_msg = ""
    """
    Error message
@ -2697,6 +2714,9 @@ class OOMResult:
    @type: str
    """
    kconfig = BaseKernelConfig()
    """Kernel configuration"""
    kversion = None
    """
    Kernel version
@ -2704,6 +2724,19 @@ class OOMResult:
    @type: str
    """
    mem_alloc_failure = OOMMemoryAllocFailureType.not_started
    """State/result of the memory allocation failure analysis
    @see: OOMAnalyser._analyse_alloc_failure()
    """
    oom_entity = None
    """
    State of this OOM (unknown, incomplete, ...)
    :type: OOMEntityState
    """
    oom_text = None
    """
    OOM text
@ -2711,6 +2744,13 @@ class OOMResult:
    @type: str
    """
    oom_type = OOMEntityType.unknown
    """
    Type of this OOM (manually or automatically triggered)
    :type: OOMEntityType
    """
    swap_active = False
    """
    Swap space active or inactive
@ -3063,7 +3103,8 @@ class OOMAnalyser:
        Extract memory watermark information from all zones
        This function fills:
-        * OOMResult.details["_watermarks"] with [<zone>][<node>][(free|min|low|high)] = <XXX>
+        * OOMResult.details["_watermarks"] with [<zone>][<node>][(free|min|low|high)] = int
        * OOMResult.details["_watermarks"] with [<zone>][<node>][(lowmem_reserve)] = List(int)
        """
        self.oom_result.details["_watermarks"] = {}
        watermark_info = self.oom_result.details["_watermarks"]
@ -3075,9 +3116,16 @@ class OOMAnalyser:
        # Therefore, we reset the counter by one line.
        self.oom_entity.back()
        node = None
        zone = None
        for line in self.oom_entity:
            match = self.REC_WATERMARK.match(line)
            if not match:
                if line.startswith("lowmem_reserve[]:"):
                    # zone and node are defined in the previous round
                    watermark_info[zone][node]["lowmem_reserve"] = [
                        int(v) for v in line.split()[1:]
                    ]
                continue
            node = int(match.group("node"))
@ -3089,6 +3137,32 @@ class OOMAnalyser:
            for i in ["free", "min", "low", "high"]:
                watermark_info[zone][node][i] = int(match.group(i))
    def _extract_node_from_watermarks(self, zone):
        """
        Search node with memory shortage: watermark "free" < "min"
        @param str zone: Requested zone
        @return: First node with memory shortage or None if no node found
        @rtype: None|int
        """
        watermark_info = self.oom_result.details["_watermarks"]
        if zone not in watermark_info:
            debug(
                "Missing watermark info for zone {} - skip memory analysis".format(zone)
            )
            return None
        # __pragma__ ('jsiter')
        for node in watermark_info[zone]:
            if watermark_info[zone][node]["free"] < watermark_info[zone][node]["min"]:
                return int(node)
        # __pragma__ ('nojsiter')
        debug(
            "Node with current memory shortage cannot be determined - skip memory analysis"
        )
        return None
    def _gfp_hex2flags(self, hexvalue):
        """\
        Convert the hexadecimal value into flags specified by definition
@ -3170,6 +3244,114 @@ class OOMAnalyser:
        ps_index.sort(key=int)
        self.oom_result.details["_pstable_index"] = ps_index
    def _check_free_chunks(self, start_with_order, zone, node):
        """Check for at least one free chunk in the current or any higher order.
        Returns True, if at lease one suitable chunk is free.
        Returns None, if buddyinfo doesn't contain information for the requested node, order or zone
        @param int start_with_order: Start checking with this order
        @param str zone: Memory zone
        @param int node: Node number
        @rtype: None|bool
        """
        if not self.oom_result.details["_buddyinfo"]:
            return None
        buddyinfo = self.oom_result.details["_buddyinfo"]
        if zone not in buddyinfo:
            return None
        for order in range(start_with_order, self.oom_result.kconfig.MAX_ORDER):
            if order not in buddyinfo[zone]:
                break
            if node not in buddyinfo[zone][order]:
                return None
            free_chunks = buddyinfo[zone][order][node]
            if free_chunks:
                return True
        return False
    def _analyse_alloc_failure(self):
        """
        Analyse why the memory allocation could be failed.
        The code in this function is inspired by mm/page_alloc.c:__zone_watermark_ok()
        """
        self.oom_result.mem_alloc_failure = OOMMemoryAllocFailureType.not_started
        if self.oom_result.oom_type == OOMEntityType.manual:
            debug("OOM triggered manually - skip memory analysis")
            return
        if "_buddyinfo" not in self.oom_result.details:
            debug("Missing buddyinfo - skip memory analysis")
            return
        if not self.oom_result.details["_buddyinfo"]:
            debug("Empty buddyinfo - skip memory analysis")
            return
        if ("trigger_proc_order" not in self.oom_result.details) or (
            "trigger_proc_mem_zone" not in self.oom_result.details
        ):
            debug(
                "Missing trigger_proc_order and/or trigger_proc_mem_zone - skip memory analysis"
            )
            return
        if "_watermarks" not in self.oom_result.details:
            debug("Missing watermark information - skip memory analysis")
            return
        order = self.oom_result.details["trigger_proc_order"]
        zone = self.oom_result.details["trigger_proc_mem_zone"]
        watermark_info = self.oom_result.details["_watermarks"]
        # "high order" requests don't trigger OOM
        if int(order) > self.oom_result.kconfig.PAGE_ALLOC_COSTLY_ORDER:
            debug("high order requests should not trigger OOM - skip memory analysis")
            self.oom_result.mem_alloc_failure = (
                OOMMemoryAllocFailureType.skipped_high_order_dont_trigger_oom
            )
            return
        # Search node with memory shortage: watermark "free" < "min"
        node = self._extract_node_from_watermarks(zone)
        if node is None:
            return  # error cause already shown as debug message
        # the remaining code is similar to mm/page_alloc.c:__zone_watermark_ok()
        # =======================================================================
        # calculation in kB and not in pages
        free_kb = watermark_info[zone][node]["free"]
        highest_zoneidx = self.oom_result.kconfig.ZONE_TYPES.index(zone)
        lowmem_reserve = watermark_info[zone][node]["lowmem_reserve"]
        min_kb = watermark_info[zone][node]["low"]
        page_size = self.oom_result.details["_buddyinfo_pagesize_kb"]
        # reduce minimum watermark for high priority calls
        # ALLOC_HIGH == __GFP_HIGH
        gfp_mask_decimal = self.oom_result.details["_trigger_proc_gfp_mask_decimal"]
        gfp_flag_high = self.oom_result.kconfig.GFP_FLAGS["__GFP_DMA"]["_value"]
        if (gfp_mask_decimal & gfp_flag_high) == gfp_flag_high:
            min_kb -= int(min_kb / 2)
        # check watermarks, if these are not met, then a high-order request also
        # cannot go ahead even if a suitable page happened to be free.
        if free_kb <= (min_kb + (lowmem_reserve[highest_zoneidx] * page_size)):
            self.oom_result.mem_alloc_failure = (
                OOMMemoryAllocFailureType.failed_below_low_watermark
            )
            return
        # For a high-order request, check at least one suitable page is free
        if not self._check_free_chunks(order, zone, node):
            self.oom_result.mem_alloc_failure = (
                OOMMemoryAllocFailureType.failed_no_free_chunks
            )
            return
        self.oom_result.mem_alloc_failure = (
            OOMMemoryAllocFailureType.failed_unknown_reason
        )
    def _calc_pstable_values(self):
        """Set additional notes to processes listed in the process table"""
        tpid = self.oom_result.details["trigger_proc_pid"]
@ -3323,6 +3505,7 @@ class OOMAnalyser:
        self._calc_system_values()
        self._calc_trigger_process_values()
        self._calc_killed_process_values()
        self._analyse_alloc_failure()
    def analyse(self):
        """
@ -4189,6 +4372,7 @@ Out of memory: Killed process 651 (unattended-upgr) total-vm:108020kB, anon-rss:
        self._show_items()
        self._show_swap_usage()
        self._show_ram_usage()
        self._show_alloc_failure()
        # generate process table
        self._show_pstable()
@ -4198,6 +4382,29 @@ Out of memory: Killed process 651 (unattended-upgr) total-vm:108020kB, anon-rss:
        element.textContent = self.oom_result.oom_text
        self.toggle_oom(show=False)
    def _show_alloc_failure(self):
        """Show details why the memory allocation failed"""
        if (
            self.oom_result.mem_alloc_failure
            == OOMMemoryAllocFailureType.failed_below_low_watermark
        ):
            show_elements(".js-alloc-failure--show")
            show_elements(".js-alloc-failure-below-low-watermark--show")
        elif (
            self.oom_result.mem_alloc_failure
            == OOMMemoryAllocFailureType.failed_no_free_chunks
        ):
            show_elements(".js-alloc-failure--show")
            show_elements(".js-alloc-failure-no-free-chunks--show")
        elif (
            self.oom_result.mem_alloc_failure
            == OOMMemoryAllocFailureType.failed_unknown_reason
        ):
            show_elements(".js-alloc-failure--show")
            show_elements(".js-alloc-failure-unknown-reason-show")
        else:
            hide_elements(".js-alloc-failure--show")
    def _show_ram_usage(self):
        """Generate RAM usage diagram"""
        ram_title_attr = (
--- a/test.py
+++ b/test.py
@ -245,6 +245,11 @@ class TestInBrowser(TestBase):
            "99% (8343236 kBytes out of 8388604 kBytes) swap space" in explanation.text,
            "Used swap space in summary not found",
        )
        self.assertTrue(
            "The request failed because after its fulfillment the free memory would be below the memory low watermark."
            in explanation.text,
            "Memory allocation failure analysis not found",
        )
        mem_node_info = self.driver.find_element(By.CLASS_NAME, "mem_node_info")
        self.assertEqual(
@ -315,6 +320,10 @@ class TestInBrowser(TestBase):
            in explanation.text,
            "Used physical memory in summary not found",
        )
        self.assertTrue(
            "The request failed because" not in explanation.text,
            "Memory allocation failure analysis found",
        )
        mem_node_info = self.driver.find_element(By.CLASS_NAME, "mem_node_info")
        self.assertEqual(
@ -937,6 +946,10 @@ Hardware name: HP ProLiant DL385 G7, BIOS A18 12/08/2012
                'Wrong watermark level for node %s in zone "%s" (got: %d, expect %d)'
                % (node, zone, level, except_level),
            )
        node = analyser._extract_node_from_watermarks("Normal")
        self.assertTrue(
            node == 0, "Wrong node with memory shortage (got: %s, expect: 0)" % node
        )
        self.assertEqual(
            analyser.oom_result.kconfig.MAX_ORDER,
            11,  # This is a hard coded value as extracted from kernel 6.2.0
@ -944,6 +957,75 @@ Hardware name: HP ProLiant DL385 G7, BIOS A18 12/08/2012
            % analyser.oom_result.kconfig.MAX_ORDER,
        )
    def test_011_alloc_failure(self):
        """Test analysis why the memory allocation could be failed"""
        oom = OOMAnalyser.OOMEntity(OOMAnalyser.OOMDisplay.example_rhel7)
        analyser = OOMAnalyser.OOMAnalyser(oom)
        success = analyser.analyse()
        self.assertTrue(success, "OOM analysis failed")
        self.assertEqual(
            analyser.oom_result.oom_type,
            OOMAnalyser.OOMEntityType.automatic,
            "OOM triggered manually",
        )
        self.assertTrue(
            "_buddyinfo" in analyser.oom_result.details, "Missing buddyinfo"
        )
        self.assertTrue(analyser.oom_result.details["_buddyinfo"], "Empty buddyinfo")
        self.assertTrue(
            "trigger_proc_order" in analyser.oom_result.details
            and "trigger_proc_mem_zone" in analyser.oom_result.details,
            "Missing trigger_proc_order and/or trigger_proc_mem_zone",
        )
        self.assertTrue(
            "_watermarks" in analyser.oom_result.details,
            "Missing watermark information - skip memory analysis",
        )
        for zone, order, node, expected_result in [
            ("DMA", 0, 0, True),
            ("DMA", 6, 0, True),
            ("DMA32", 0, 0, True),
            ("DMA32", 10, 0, False),
            ("Normal", 0, 0, True),
            ("Normal", 0, 1, True),
            ("Normal", 6, 0, False),
            ("Normal", 6, 1, True),
            ("Normal", 7, 0, False),
            ("Normal", 7, 1, True),
            ("Normal", 9, 0, False),
            ("Normal", 9, 1, False),
        ]:
            result = analyser._check_free_chunks(order, zone, node)
            self.assertEqual(
                result,
                expected_result,
                "Wrong result of the check for free chunks with the same or higher order for Node %d, "
                'Zone "%s" and order %d (got: %s, expected %s)'
                % (node, zone, order, result, expected_result),
            )
        # Search node with memory shortage: watermark "free" < "min"
        for zone, expected_node in [
            ("DMA", None),
            ("DMA32", None),
            ("Normal", 0),
        ]:
            node = analyser._extract_node_from_watermarks(zone)
            self.assertEqual(
                node,
                expected_node,
                'Wrong result if a node has memory shortage in zone "%s" (got: %s, expected %s)'
                % (zone, node, expected_node),
            )
        self.assertEqual(
            analyser.oom_result.mem_alloc_failure,
            OOMAnalyser.OOMMemoryAllocFailureType.failed_below_low_watermark,
            "Unexpected reason why the memory allocation has failed.",
        )
 if __name__ == "__main__":
    unittest.main(verbosity=2)