]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.14] gh-90949: add Expat API to prevent XML deadly allocations (CVE-2025-59375...
authorBénédikt Tran <10796600+picnixz@users.noreply.github.com>
Sun, 2 Nov 2025 09:33:36 +0000 (10:33 +0100)
committerGitHub <noreply@github.com>
Sun, 2 Nov 2025 09:33:36 +0000 (09:33 +0000)
* [3.14] gh-90949: add Expat API to prevent XML deadly allocations (CVE-2025-59375) (GH-139234)

Expose the XML Expat 2.7.2 mitigation APIs to disallow use of
disproportional amounts of dynamic memory from within an Expat
parser (see CVE-2025-59375 for instance).

The exposed APIs are available on Expat parsers, that is,
parsers created by `xml.parsers.expat.ParserCreate()`, as:

- `parser.SetAllocTrackerActivationThreshold(threshold)`, and
- `parser.SetAllocTrackerMaximumAmplification(max_factor)`.

(cherry picked from commit f04bea44c37793561d753dd4ca6e7cd658137553)
(cherry picked from commit 68a1778b7721f3fb853cd3aa674f7039c2a4df36)

Doc/library/pyexpat.rst
Include/pyexpat.h
Lib/test/test_pyexpat.py
Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst [new file with mode: 0644]
Modules/clinic/pyexpat.c.h
Modules/expat/pyexpatns.h
Modules/pyexpat.c

index 5506ac828e5abe7aa27f04492db3a79522810ff5..0b08a9b0dedef686ff5bf2873837c5c215b8a995 100644 (file)
@@ -72,6 +72,13 @@ The :mod:`xml.parsers.expat` module contains two functions:
    *encoding* [1]_ is given it will override the implicit or explicit encoding of the
    document.
 
+   .. _xmlparser-non-root:
+
+   Parsers created through :func:`!ParserCreate` are called "root" parsers,
+   in the sense that they do not have any parent parser attached. Non-root
+   parsers are created by :meth:`parser.ExternalEntityParserCreate
+   <xmlparser.ExternalEntityParserCreate>`.
+
    Expat can optionally do XML namespace processing for you, enabled by providing a
    value for *namespace_separator*.  The value must be a one-character string; a
    :exc:`ValueError` will be raised if the string has an illegal length (``None``
@@ -231,6 +238,55 @@ XMLParser Objects
    .. versionadded:: 3.13
 
 
+:class:`!xmlparser` objects have the following methods to mitigate some
+common XML vulnerabilities.
+
+.. method:: xmlparser.SetAllocTrackerActivationThreshold(threshold, /)
+
+   Sets the number of allocated bytes of dynamic memory needed to activate
+   protection against disproportionate use of RAM.
+
+   By default, parser objects have an allocation activation threshold of 64 MiB,
+   or equivalently 67,108,864 bytes.
+
+   An :exc:`ExpatError` is raised if this method is called on a
+   |xml-non-root-parser| parser.
+   The corresponding :attr:`~ExpatError.lineno` and :attr:`~ExpatError.offset`
+   should not be used as they may have no special meaning.
+
+   .. versionadded:: next
+
+.. method:: xmlparser.SetAllocTrackerMaximumAmplification(max_factor, /)
+
+   Sets the maximum amplification factor between direct input and bytes
+   of dynamic memory allocated.
+
+   The amplification factor is calculated as ``allocated / direct``
+   while parsing, where ``direct`` is the number of bytes read from
+   the primary document in parsing and ``allocated`` is the number
+   of bytes of dynamic memory allocated in the parser hierarchy.
+
+   The *max_factor* value must be a non-NaN :class:`float` value greater than
+   or equal to 1.0. Amplification factors greater than 100.0 can be observed
+   near the start of parsing even with benign files in practice. In particular,
+   the activation threshold should be carefully chosen to avoid false positives.
+
+   By default, parser objects have a maximum amplification factor of 100.0.
+
+   An :exc:`ExpatError` is raised if this method is called on a
+   |xml-non-root-parser| parser or if *max_factor* is outside the valid range.
+   The corresponding :attr:`~ExpatError.lineno` and :attr:`~ExpatError.offset`
+   should not be used as they may have no special meaning.
+
+   .. note::
+
+      The maximum amplification factor is only considered if the threshold
+      that can be adjusted by :meth:`.SetAllocTrackerActivationThreshold`
+      is exceeded.
+
+   .. versionadded:: next
+
+
 :class:`xmlparser` objects have the following attributes:
 
 
@@ -954,3 +1010,4 @@ The ``errors`` module has the following attributes:
    not. See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
    and https://www.iana.org/assignments/character-sets/character-sets.xhtml.
 
+.. |xml-non-root-parser| replace:: :ref:`non-root <xmlparser-non-root>`
index 9824d099c3df7d08083ee6f9232dedcbe9d846a0..04548b7684a2fda5ff17203943133f1d73a3cd9a 100644 (file)
@@ -52,6 +52,11 @@ struct PyExpat_CAPI
     int (*SetHashSalt)(XML_Parser parser, unsigned long hash_salt);
     /* might be NULL for expat < 2.6.0 */
     XML_Bool (*SetReparseDeferralEnabled)(XML_Parser parser, XML_Bool enabled);
+    /* might be NULL for expat < 2.7.2 */
+    XML_Bool (*SetAllocTrackerActivationThreshold)(
+        XML_Parser parser, unsigned long long activationThresholdBytes);
+    XML_Bool (*SetAllocTrackerMaximumAmplification)(
+        XML_Parser parser, float maxAmplificationFactor);
     /* always add new stuff to the end! */
 };
 
index 5c8189b33c32859ffa1cc14d329fc94c0bc152f0..daeaa38a3c508557ca35873a946fe722622a5423 100644 (file)
@@ -1,7 +1,10 @@
 # XXX TypeErrors on calling handlers, or on bad return values from a
 # handler, are obscure and unhelpful.
 
+import abc
+import functools
 import os
+import re
 import sys
 import sysconfig
 import textwrap
@@ -9,7 +12,7 @@ import unittest
 import traceback
 from io import BytesIO
 from test import support
-from test.support import os_helper
+from test.support import import_helper, os_helper
 
 from xml.parsers import expat
 from xml.parsers.expat import errors
@@ -863,5 +866,199 @@ class ReparseDeferralTest(unittest.TestCase):
         self.assertEqual(started, ['doc'])
 
 
+class AttackProtectionTestBase(abc.ABC):
+    """
+    Base class for testing protections against XML payloads with
+    disproportionate amplification.
+
+    The protections being tested should detect and prevent attacks
+    that leverage disproportionate amplification from small inputs.
+    """
+
+    @staticmethod
+    def exponential_expansion_payload(*, nrows, ncols, text='.'):
+        """Create a billion laughs attack payload.
+
+        Be careful: the number of total items is pow(n, k), thereby
+        requiring at least pow(ncols, nrows) * sizeof(text) memory!
+        """
+        template = textwrap.dedent(f"""\
+            <?xml version="1.0"?>
+            <!DOCTYPE doc [
+                <!ENTITY row0 "{text}">
+                <!ELEMENT doc (#PCDATA)>
+            {{body}}
+            ]>
+            <doc>&row{nrows};</doc>
+        """).rstrip()
+
+        body = '\n'.join(
+            f'<!ENTITY row{i + 1} "{f"&row{i};" * ncols}">'
+            for i in range(nrows)
+        )
+        body = textwrap.indent(body, ' ' * 4)
+        return template.format(body=body)
+
+    def test_payload_generation(self):
+        # self-test for exponential_expansion_payload()
+        payload = self.exponential_expansion_payload(nrows=2, ncols=3)
+        self.assertEqual(payload, textwrap.dedent("""\
+            <?xml version="1.0"?>
+            <!DOCTYPE doc [
+                <!ENTITY row0 ".">
+                <!ELEMENT doc (#PCDATA)>
+                <!ENTITY row1 "&row0;&row0;&row0;">
+                <!ENTITY row2 "&row1;&row1;&row1;">
+            ]>
+            <doc>&row2;</doc>
+        """).rstrip())
+
+    def assert_root_parser_failure(self, func, /, *args, **kwargs):
+        """Check that func(*args, **kwargs) is invalid for a sub-parser."""
+        msg = "parser must be a root parser"
+        self.assertRaisesRegex(expat.ExpatError, msg, func, *args, **kwargs)
+
+    @abc.abstractmethod
+    def assert_rejected(self, func, /, *args, **kwargs):
+        """Assert that func(*args, **kwargs) triggers the attack protection.
+
+        Note: this method must ensure that the attack protection being tested
+        is the one that is actually triggered at runtime, e.g., by matching
+        the exact error message.
+        """
+
+    @abc.abstractmethod
+    def set_activation_threshold(self, parser, threshold):
+        """Set the activation threshold for the tested protection."""
+
+    @abc.abstractmethod
+    def set_maximum_amplification(self, parser, max_factor):
+        """Set the maximum amplification factor for the tested protection."""
+
+    @abc.abstractmethod
+    def test_set_activation_threshold__threshold_reached(self):
+        """Test when the activation threshold is exceeded."""
+
+    @abc.abstractmethod
+    def test_set_activation_threshold__threshold_not_reached(self):
+        """Test when the activation threshold is not exceeded."""
+
+    def test_set_activation_threshold__invalid_threshold_type(self):
+        parser = expat.ParserCreate()
+        setter = functools.partial(self.set_activation_threshold, parser)
+
+        self.assertRaises(TypeError, setter, 1.0)
+        self.assertRaises(TypeError, setter, -1.5)
+        self.assertRaises(ValueError, setter, -5)
+
+    def test_set_activation_threshold__invalid_threshold_range(self):
+        _testcapi = import_helper.import_module("_testcapi")
+        parser = expat.ParserCreate()
+        setter = functools.partial(self.set_activation_threshold, parser)
+
+        self.assertRaises(OverflowError, setter, _testcapi.ULLONG_MAX + 1)
+
+    def test_set_activation_threshold__fail_for_subparser(self):
+        parser = expat.ParserCreate()
+        subparser = parser.ExternalEntityParserCreate(None)
+        setter = functools.partial(self.set_activation_threshold, subparser)
+        self.assert_root_parser_failure(setter, 12345)
+
+    @abc.abstractmethod
+    def test_set_maximum_amplification__amplification_exceeded(self):
+        """Test when the amplification factor is exceeded."""
+
+    @abc.abstractmethod
+    def test_set_maximum_amplification__amplification_not_exceeded(self):
+        """Test when the amplification factor is not exceeded."""
+
+    def test_set_maximum_amplification__infinity(self):
+        inf = float('inf')  # an 'inf' threshold is allowed by Expat
+        parser = expat.ParserCreate()
+        self.assertIsNone(self.set_maximum_amplification(parser, inf))
+
+    def test_set_maximum_amplification__invalid_max_factor_type(self):
+        parser = expat.ParserCreate()
+        setter = functools.partial(self.set_maximum_amplification, parser)
+
+        self.assertRaises(TypeError, setter, None)
+        self.assertRaises(TypeError, setter, 'abc')
+
+    def test_set_maximum_amplification__invalid_max_factor_range(self):
+        parser = expat.ParserCreate()
+        setter = functools.partial(self.set_maximum_amplification, parser)
+
+        msg = re.escape("'max_factor' must be at least 1.0")
+        self.assertRaisesRegex(expat.ExpatError, msg, setter, float('nan'))
+        self.assertRaisesRegex(expat.ExpatError, msg, setter, 0.99)
+
+    def test_set_maximum_amplification__fail_for_subparser(self):
+        parser = expat.ParserCreate()
+        subparser = parser.ExternalEntityParserCreate(None)
+        setter = functools.partial(self.set_maximum_amplification, subparser)
+        self.assert_root_parser_failure(setter, 123.45)
+
+
+@unittest.skipIf(expat.version_info < (2, 7, 2), "requires Expat >= 2.7.2")
+class MemoryProtectionTest(AttackProtectionTestBase, unittest.TestCase):
+
+    # NOTE: with the default Expat configuration, the billion laughs protection
+    # may hit before the allocation limiter if exponential_expansion_payload()
+    # is not carefully parametrized. As such, the payloads should be chosen so
+    # that either the allocation limiter is hit before other protections are
+    # triggered or no protection at all is triggered.
+
+    def assert_rejected(self, func, /, *args, **kwargs):
+        """Check that func(*args, **kwargs) hits the allocation limit."""
+        msg = r"out of memory: line \d+, column \d+"
+        self.assertRaisesRegex(expat.ExpatError, msg, func, *args, **kwargs)
+
+    def set_activation_threshold(self, parser, threshold):
+        return parser.SetAllocTrackerActivationThreshold(threshold)
+
+    def set_maximum_amplification(self, parser, max_factor):
+        return parser.SetAllocTrackerMaximumAmplification(max_factor)
+
+    def test_set_activation_threshold__threshold_reached(self):
+        parser = expat.ParserCreate()
+        # Choose a threshold expected to be always reached.
+        self.set_activation_threshold(parser, 3)
+        # Check that the threshold is reached by choosing a small factor
+        # and a payload whose peak amplification factor exceeds it.
+        self.assertIsNone(self.set_maximum_amplification(parser, 1.0))
+        payload = self.exponential_expansion_payload(ncols=10, nrows=4)
+        self.assert_rejected(parser.Parse, payload, True)
+
+    def test_set_activation_threshold__threshold_not_reached(self):
+        parser = expat.ParserCreate()
+        # Choose a threshold expected to be never reached.
+        self.set_activation_threshold(parser, pow(10, 5))
+        # Check that the threshold is reached by choosing a small factor
+        # and a payload whose peak amplification factor exceeds it.
+        self.assertIsNone(self.set_maximum_amplification(parser, 1.0))
+        payload = self.exponential_expansion_payload(ncols=10, nrows=4)
+        self.assertIsNotNone(parser.Parse(payload, True))
+
+    def test_set_maximum_amplification__amplification_exceeded(self):
+        parser = expat.ParserCreate()
+        # Unconditionally enable maximum activation factor.
+        self.set_activation_threshold(parser, 0)
+        # Choose a max amplification factor expected to always be exceeded.
+        self.assertIsNone(self.set_maximum_amplification(parser, 1.0))
+        # Craft a payload for which the peak amplification factor is > 1.0.
+        payload = self.exponential_expansion_payload(ncols=1, nrows=2)
+        self.assert_rejected(parser.Parse, payload, True)
+
+    def test_set_maximum_amplification__amplification_not_exceeded(self):
+        parser = expat.ParserCreate()
+        # Unconditionally enable maximum activation factor.
+        self.set_activation_threshold(parser, 0)
+        # Choose a max amplification factor expected to never be exceeded.
+        self.assertIsNone(self.set_maximum_amplification(parser, 1e4))
+        # Craft a payload for which the peak amplification factor is < 1e4.
+        payload = self.exponential_expansion_payload(ncols=1, nrows=2)
+        self.assertIsNotNone(parser.Parse(payload, True))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst b/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst
new file mode 100644 (file)
index 0000000..5611f33
--- /dev/null
@@ -0,0 +1,5 @@
+Add :meth:`~xml.parsers.expat.xmlparser.SetAllocTrackerActivationThreshold`
+and :meth:`~xml.parsers.expat.xmlparser.SetAllocTrackerMaximumAmplification`
+to :ref:`xmlparser <xmlparser-objects>` objects to prevent use of
+disproportional amounts of dynamic memory from within an Expat parser.
+Patch by Bénédikt Tran.
index 13210e3be0f747d189352a52e1e8ddf3a04eb8a3..e178547060446e4ac0e0e5478bb6ed70125485d2 100644 (file)
@@ -6,6 +6,7 @@ preserve
 #  include "pycore_gc.h"          // PyGC_Head
 #  include "pycore_runtime.h"     // _Py_SINGLETON()
 #endif
+#include "pycore_long.h"          // _PyLong_UnsignedLongLong_Converter()
 #include "pycore_modsupport.h"    // _PyArg_UnpackKeywords()
 
 PyDoc_STRVAR(pyexpat_xmlparser_SetReparseDeferralEnabled__doc__,
@@ -408,6 +409,131 @@ exit:
 
 #endif /* (XML_COMBINED_VERSION >= 19505) */
 
+#if (XML_COMBINED_VERSION >= 20702)
+
+PyDoc_STRVAR(pyexpat_xmlparser_SetAllocTrackerActivationThreshold__doc__,
+"SetAllocTrackerActivationThreshold($self, threshold, /)\n"
+"--\n"
+"\n"
+"Sets the number of allocated bytes of dynamic memory needed to activate protection against disproportionate use of RAM.\n"
+"\n"
+"By default, parser objects have an allocation activation threshold of 64 MiB.");
+
+#define PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF    \
+    {"SetAllocTrackerActivationThreshold", _PyCFunction_CAST(pyexpat_xmlparser_SetAllocTrackerActivationThreshold), METH_METHOD|METH_FASTCALL|METH_KEYWORDS, pyexpat_xmlparser_SetAllocTrackerActivationThreshold__doc__},
+
+static PyObject *
+pyexpat_xmlparser_SetAllocTrackerActivationThreshold_impl(xmlparseobject *self,
+                                                          PyTypeObject *cls,
+                                                          unsigned long long threshold);
+
+static PyObject *
+pyexpat_xmlparser_SetAllocTrackerActivationThreshold(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+    #  define KWTUPLE (PyObject *)&_Py_SINGLETON(tuple_empty)
+    #else
+    #  define KWTUPLE NULL
+    #endif
+
+    static const char * const _keywords[] = {"", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "SetAllocTrackerActivationThreshold",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[1];
+    unsigned long long threshold;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!_PyLong_UnsignedLongLong_Converter(args[0], &threshold)) {
+        goto exit;
+    }
+    return_value = pyexpat_xmlparser_SetAllocTrackerActivationThreshold_impl((xmlparseobject *)self, cls, threshold);
+
+exit:
+    return return_value;
+}
+
+#endif /* (XML_COMBINED_VERSION >= 20702) */
+
+#if (XML_COMBINED_VERSION >= 20702)
+
+PyDoc_STRVAR(pyexpat_xmlparser_SetAllocTrackerMaximumAmplification__doc__,
+"SetAllocTrackerMaximumAmplification($self, max_factor, /)\n"
+"--\n"
+"\n"
+"Sets the maximum amplification factor between direct input and bytes of dynamic memory allocated.\n"
+"\n"
+"The amplification factor is calculated as \"allocated / direct\" while parsing,\n"
+"where \"direct\" is the number of bytes read from the primary document in parsing\n"
+"and \"allocated\" is the number of bytes of dynamic memory allocated in the parser\n"
+"hierarchy.\n"
+"\n"
+"The \'max_factor\' value must be a non-NaN floating point value greater than\n"
+"or equal to 1.0. Amplification factors greater than 100.0 can be observed\n"
+"near the start of parsing even with benign files in practice. In particular,\n"
+"the activation threshold should be carefully chosen to avoid false positives.\n"
+"\n"
+"By default, parser objects have a maximum amplification factor of 100.0.");
+
+#define PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF    \
+    {"SetAllocTrackerMaximumAmplification", _PyCFunction_CAST(pyexpat_xmlparser_SetAllocTrackerMaximumAmplification), METH_METHOD|METH_FASTCALL|METH_KEYWORDS, pyexpat_xmlparser_SetAllocTrackerMaximumAmplification__doc__},
+
+static PyObject *
+pyexpat_xmlparser_SetAllocTrackerMaximumAmplification_impl(xmlparseobject *self,
+                                                           PyTypeObject *cls,
+                                                           float max_factor);
+
+static PyObject *
+pyexpat_xmlparser_SetAllocTrackerMaximumAmplification(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+    #  define KWTUPLE (PyObject *)&_Py_SINGLETON(tuple_empty)
+    #else
+    #  define KWTUPLE NULL
+    #endif
+
+    static const char * const _keywords[] = {"", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "SetAllocTrackerMaximumAmplification",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[1];
+    float max_factor;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (PyFloat_CheckExact(args[0])) {
+        max_factor = (float) (PyFloat_AS_DOUBLE(args[0]));
+    }
+    else
+    {
+        max_factor = (float) PyFloat_AsDouble(args[0]);
+        if (max_factor == -1.0 && PyErr_Occurred()) {
+            goto exit;
+        }
+    }
+    return_value = pyexpat_xmlparser_SetAllocTrackerMaximumAmplification_impl((xmlparseobject *)self, cls, max_factor);
+
+exit:
+    return return_value;
+}
+
+#endif /* (XML_COMBINED_VERSION >= 20702) */
+
 PyDoc_STRVAR(pyexpat_ParserCreate__doc__,
 "ParserCreate($module, /, encoding=None, namespace_separator=None,\n"
 "             intern=<unrepresentable>)\n"
@@ -552,4 +678,12 @@ exit:
 #ifndef PYEXPAT_XMLPARSER_USEFOREIGNDTD_METHODDEF
     #define PYEXPAT_XMLPARSER_USEFOREIGNDTD_METHODDEF
 #endif /* !defined(PYEXPAT_XMLPARSER_USEFOREIGNDTD_METHODDEF) */
-/*[clinic end generated code: output=4dbdc959c67dc2d5 input=a9049054013a1b77]*/
+
+#ifndef PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF
+    #define PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF
+#endif /* !defined(PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF) */
+
+#ifndef PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF
+    #define PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF
+#endif /* !defined(PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF) */
+/*[clinic end generated code: output=e73935658c04c83e input=a9049054013a1b77]*/
index 8ee03ef079281576658ef714533dcdc400d41e79..fc6b482d587e0d647baaa3a65d48662a25d2ae10 100644 (file)
@@ -82,6 +82,8 @@
 #define XmlPrologStateInit              PyExpat_XmlPrologStateInit
 #define XmlPrologStateInitExternalEntity    PyExpat_XmlPrologStateInitExternalEntity
 #define XML_ResumeParser                PyExpat_XML_ResumeParser
+#define XML_SetAllocTrackerActivationThreshold PyExpat_XML_SetAllocTrackerActivationThreshold
+#define XML_SetAllocTrackerMaximumAmplification PyExpat_XML_SetAllocTrackerMaximumAmplification
 #define XML_SetAttlistDeclHandler       PyExpat_XML_SetAttlistDeclHandler
 #define XML_SetBase                     PyExpat_XML_SetBase
 #define XML_SetBillionLaughsAttackProtectionActivationThreshold PyExpat_XML_SetBillionLaughsAttackProtectionActivationThreshold
index d55ad3cdeef2a8af4107303a52c6810e8bd13739..009af9539f2f40acd60946505433ae6e3e71c27d 100644 (file)
@@ -9,6 +9,8 @@
 
 #include <stdbool.h>
 #include <stddef.h>               // offsetof()
+
+#include "expat_config.h"
 #include "expat.h"
 #include "pyexpat.h"
 
@@ -129,50 +131,89 @@ CALL_XML_HANDLER_SETTER(const struct HandlerInfo *handler_info,
     setter(xml_parser, xml_handler);
 }
 
+static int
+set_xml_error_attr_code(PyObject *err, enum XML_Error code)
+{
+    PyObject *v = PyLong_FromLong((long)code);
+    int ok = v != NULL && PyObject_SetAttr(err, &_Py_ID(code), v) != -1;
+    Py_XDECREF(v);
+    return ok;
+}
+
 /* Set an integer attribute on the error object; return true on success,
  * false on an exception.
  */
 static int
-set_error_attr(PyObject *err, const char *name, int value)
+set_xml_error_attr_location(PyObject *err, const char *name, XML_Size value)
 {
-    PyObject *v = PyLong_FromLong(value);
+    PyObject *v = PyLong_FromSize_t((size_t)value);
+    int ok = v != NULL && PyObject_SetAttrString(err, name, v) != -1;
+    Py_XDECREF(v);
+    return ok;
+}
 
-    if (v == NULL || PyObject_SetAttrString(err, name, v) == -1) {
-        Py_XDECREF(v);
-        return 0;
+
+static PyObject *
+set_xml_error(pyexpat_state *state,
+              enum XML_Error code, XML_Size lineno, XML_Size column,
+              const char *errmsg)
+{
+    PyObject *arg;
+    if (errmsg == NULL) {
+        arg = PyUnicode_FromFormat(
+            "%s: line %zu, column %zu",
+            XML_ErrorString(code),
+            (size_t)lineno, (size_t)column
+        );
     }
-    Py_DECREF(v);
-    return 1;
+    else {
+        arg = PyUnicode_FromStringAndSize(errmsg, strlen(errmsg));
+    }
+    if (arg == NULL) {
+        return NULL;
+    }
+    PyObject *res = PyObject_CallOneArg(state->error, arg);
+    Py_DECREF(arg);
+    if (
+        res != NULL
+        && set_xml_error_attr_code(res, code)
+        && set_xml_error_attr_location(res, "lineno", lineno)
+        && set_xml_error_attr_location(res, "offset", column)
+    ) {
+        PyErr_SetObject(state->error, res);
+    }
+    Py_XDECREF(res);
+    return NULL;
 }
 
+#define SET_XML_ERROR(STATE, SELF, CODE, ERRMSG)                    \
+    do {                                                            \
+        XML_Parser parser = SELF->itself;                           \
+        assert(parser != NULL);                                     \
+        XML_Size lineno = XML_GetCurrentLineNumber(parser);         \
+        XML_Size column = XML_GetCurrentColumnNumber(parser);       \
+        (void)set_xml_error(state, CODE, lineno, column, ERRMSG);   \
+    } while (0)
+
 /* Build and set an Expat exception, including positioning
  * information.  Always returns NULL.
  */
 static PyObject *
 set_error(pyexpat_state *state, xmlparseobject *self, enum XML_Error code)
 {
-    PyObject *err;
-    PyObject *buffer;
-    XML_Parser parser = self->itself;
-    int lineno = XML_GetErrorLineNumber(parser);
-    int column = XML_GetErrorColumnNumber(parser);
+    SET_XML_ERROR(state, self, code, NULL);
+    return NULL;
+}
 
-    buffer = PyUnicode_FromFormat("%s: line %i, column %i",
-                                  XML_ErrorString(code), lineno, column);
-    if (buffer == NULL)
-        return NULL;
-    err = PyObject_CallOneArg(state->error, buffer);
-    Py_DECREF(buffer);
-    if (  err != NULL
-          && set_error_attr(err, "code", code)
-          && set_error_attr(err, "offset", column)
-          && set_error_attr(err, "lineno", lineno)) {
-        PyErr_SetObject(state->error, err);
-    }
-    Py_XDECREF(err);
+static PyObject *
+set_invalid_arg(pyexpat_state *state, xmlparseobject *self, const char *errmsg)
+{
+    SET_XML_ERROR(state, self, XML_ERROR_INVALID_ARGUMENT, errmsg);
     return NULL;
 }
 
+#undef SET_XML_ERROR
+
 static int
 have_handler(xmlparseobject *self, int type)
 {
@@ -1143,6 +1184,112 @@ pyexpat_xmlparser_UseForeignDTD_impl(xmlparseobject *self, PyTypeObject *cls,
 }
 #endif
 
+#if XML_COMBINED_VERSION >= 20702
+static PyObject *
+set_activation_threshold(xmlparseobject *self,
+                         PyTypeObject *cls,
+                         unsigned long long threshold,
+                         XML_Bool (*setter)(XML_Parser, unsigned long long))
+{
+    assert(self->itself != NULL);
+    if (setter(self->itself, threshold) == XML_TRUE) {
+        Py_RETURN_NONE;
+    }
+    // The setter fails if self->itself is NULL (which is not possible here)
+    // or is a non-root parser, which currently only happens for parsers
+    // created by ExternalEntityParserCreate().
+    pyexpat_state *state = PyType_GetModuleState(cls);
+    return set_invalid_arg(state, self, "parser must be a root parser");
+}
+
+static PyObject *
+set_maximum_amplification(xmlparseobject *self,
+                          PyTypeObject *cls,
+                          float max_factor,
+                          XML_Bool (*setter)(XML_Parser, float))
+{
+    assert(self->itself != NULL);
+    if (setter(self->itself, max_factor) == XML_TRUE) {
+        Py_RETURN_NONE;
+    }
+    // The setter fails if self->itself is NULL (which is not possible here),
+    // is a non-root parser, which currently only happens for parsers created
+    // by ExternalEntityParserCreate(), or if 'max_factor' is NaN or < 1.0.
+    pyexpat_state *state = PyType_GetModuleState(cls);
+    // Note: Expat has no API to determine whether a parser is a root parser,
+    // and since the Expat functions for defining the various maximum allowed
+    // amplifcation factors fail when a bad parser or an out-of-range factor
+    // is given without specifying which check failed, we check whether the
+    // factor is out-of-range to improve the error message. See also gh-90949.
+    const char *message = (isnan(max_factor) || max_factor < 1.0f)
+          ? "'max_factor' must be at least 1.0"
+          : "parser must be a root parser";
+    return set_invalid_arg(state, self, message);
+}
+#endif
+
+#if XML_COMBINED_VERSION >= 20702
+/*[clinic input]
+pyexpat.xmlparser.SetAllocTrackerActivationThreshold
+
+    cls: defining_class
+    threshold: unsigned_long_long
+    /
+
+Sets the number of allocated bytes of dynamic memory needed to activate protection against disproportionate use of RAM.
+
+By default, parser objects have an allocation activation threshold of 64 MiB.
+[clinic start generated code]*/
+
+static PyObject *
+pyexpat_xmlparser_SetAllocTrackerActivationThreshold_impl(xmlparseobject *self,
+                                                          PyTypeObject *cls,
+                                                          unsigned long long threshold)
+/*[clinic end generated code: output=bed7e93207ba08c5 input=9c706b75c18e4ea1]*/
+{
+    return set_activation_threshold(
+        self, cls, threshold,
+        XML_SetAllocTrackerActivationThreshold
+    );
+}
+#endif
+
+#if XML_COMBINED_VERSION >= 20702
+/*[clinic input]
+pyexpat.xmlparser.SetAllocTrackerMaximumAmplification
+
+    cls: defining_class
+    max_factor: float
+    /
+
+Sets the maximum amplification factor between direct input and bytes of dynamic memory allocated.
+
+The amplification factor is calculated as "allocated / direct" while parsing,
+where "direct" is the number of bytes read from the primary document in parsing
+and "allocated" is the number of bytes of dynamic memory allocated in the parser
+hierarchy.
+
+The 'max_factor' value must be a non-NaN floating point value greater than
+or equal to 1.0. Amplification factors greater than 100.0 can be observed
+near the start of parsing even with benign files in practice. In particular,
+the activation threshold should be carefully chosen to avoid false positives.
+
+By default, parser objects have a maximum amplification factor of 100.0.
+[clinic start generated code]*/
+
+static PyObject *
+pyexpat_xmlparser_SetAllocTrackerMaximumAmplification_impl(xmlparseobject *self,
+                                                           PyTypeObject *cls,
+                                                           float max_factor)
+/*[clinic end generated code: output=6e44bd48c9b112a0 input=918b9266b490a722]*/
+{
+    return set_maximum_amplification(
+        self, cls, max_factor,
+        XML_SetAllocTrackerMaximumAmplification
+    );
+}
+#endif
+
 static struct PyMethodDef xmlparse_methods[] = {
     PYEXPAT_XMLPARSER_PARSE_METHODDEF
     PYEXPAT_XMLPARSER_PARSEFILE_METHODDEF
@@ -1151,9 +1298,9 @@ static struct PyMethodDef xmlparse_methods[] = {
     PYEXPAT_XMLPARSER_GETINPUTCONTEXT_METHODDEF
     PYEXPAT_XMLPARSER_EXTERNALENTITYPARSERCREATE_METHODDEF
     PYEXPAT_XMLPARSER_SETPARAMENTITYPARSING_METHODDEF
-#if XML_COMBINED_VERSION >= 19505
     PYEXPAT_XMLPARSER_USEFOREIGNDTD_METHODDEF
-#endif
+    PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF
+    PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF
     PYEXPAT_XMLPARSER_SETREPARSEDEFERRALENABLED_METHODDEF
     PYEXPAT_XMLPARSER_GETREPARSEDEFERRALENABLED_METHODDEF
     {NULL, NULL}  /* sentinel */
@@ -2166,6 +2313,13 @@ pyexpat_exec(PyObject *mod)
 #else
     capi->SetReparseDeferralEnabled = NULL;
 #endif
+#if XML_COMBINED_VERSION >= 20702
+    capi->SetAllocTrackerActivationThreshold = XML_SetAllocTrackerActivationThreshold;
+    capi->SetAllocTrackerMaximumAmplification = XML_SetAllocTrackerMaximumAmplification;
+#else
+    capi->SetAllocTrackerActivationThreshold = NULL;
+    capi->SetAllocTrackerMaximumAmplification = NULL;
+#endif
 
     /* export using capsule */
     PyObject *capi_object = PyCapsule_New(capi, PyExpat_CAPSULE_NAME,