]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-106931: Intern Statically Allocated Strings Globally (gh-107272)
authorEric Snow <ericsnowcurrently@gmail.com>
Thu, 27 Jul 2023 19:56:59 +0000 (13:56 -0600)
committerGitHub <noreply@github.com>
Thu, 27 Jul 2023 19:56:59 +0000 (13:56 -0600)
We tried this before with a dict and for all interned strings.  That ran into problems due to interpreter isolation.  However, exclusively using a per-interpreter cache caused some inconsistency that can eliminate the benefit of interning.  Here we circle back to using a global cache, but only for statically allocated strings.  We also use a more-basic _Py_hashtable_t for that global cache instead of a dict.

Ideally we would only have the global cache, but the optional isolation of each interpreter's allocator means that a non-static string object must not outlive its interpreter.  Thus we would have to store a copy of each such interned string in the global cache, tied to the main interpreter.

Include/cpython/unicodeobject.h
Include/internal/pycore_global_objects.h
Include/internal/pycore_hashtable.h
Include/internal/pycore_runtime.h
Include/internal/pycore_runtime_init.h
Lib/test/test_sys.py
Misc/NEWS.d/next/Core and Builtins/2023-07-25-15-29-26.gh-issue-106931.kKU1le.rst [new file with mode: 0644]
Objects/unicodeobject.c
Python/hashtable.c
Tools/build/deepfreeze.py

index fa00b350a799f87d4a2157288f01601db34e07d1..859ab7178e920ac7a1a07cb02e5ccf679686f044 100644 (file)
@@ -140,9 +140,11 @@ typedef struct {
            and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
            set, use the PyASCIIObject structure. */
         unsigned int ascii:1;
+        /* The object is statically allocated. */
+        unsigned int statically_allocated:1;
         /* Padding to ensure that PyUnicode_DATA() is always aligned to
            4 bytes (see issue #19537 on m68k). */
-        unsigned int :25;
+        unsigned int :24;
     } state;
 } PyASCIIObject;
 
index 5a3fb132c745ab2ec64efb1221736a4723fe3c82..442f8516278b023a8603cd22fa39f7a9aa4dd3a9 100644 (file)
@@ -8,6 +8,7 @@ extern "C" {
 #  error "this header requires Py_BUILD_CORE define"
 #endif
 
+#include "pycore_hashtable.h"       // _Py_hashtable_t
 #include "pycore_gc.h"              // PyGC_Head
 #include "pycore_global_strings.h"  // struct _Py_global_strings
 #include "pycore_hamt.h"            // PyHamtNode_Bitmap
@@ -28,6 +29,11 @@ extern "C" {
 #define _Py_SINGLETON(NAME) \
     _Py_GLOBAL_OBJECT(singletons.NAME)
 
+struct _Py_cached_objects {
+    // XXX We could statically allocate the hashtable.
+    _Py_hashtable_t *interned_strings;
+};
+
 struct _Py_static_objects {
     struct {
         /* Small integers are preallocated in this array so that they
index 6501ab14d27684208e0c25d07f5752b6651095f0..f57978a8d614fbedb9d9b0ee858bd9732745b809 100644 (file)
@@ -106,6 +106,7 @@ PyAPI_FUNC(int) _Py_hashtable_foreach(
     void *user_data);
 
 PyAPI_FUNC(size_t) _Py_hashtable_size(const _Py_hashtable_t *ht);
+PyAPI_FUNC(size_t) _Py_hashtable_len(const _Py_hashtable_t *ht);
 
 /* Add a new entry to the hash. The key must not be present in the hash table.
    Return 0 on success, -1 on memory error. */
index eb6b66b335f4bdb7528265dd9cadc7b5b6bda40e..0ec86ee6c50ca3d8234f49f1cfc2bc212ced6b72 100644 (file)
@@ -249,6 +249,7 @@ typedef struct pyruntimestate {
     struct _types_runtime_state types;
 
     /* All the objects that are shared by the runtime's interpreters. */
+    struct _Py_cached_objects cached_objects;
     struct _Py_static_objects static_objects;
 
     /* The following fields are here to avoid allocation during init.
index e72e7422c7207edd11984d07661e4e91831432ee..840e41c59ca7376eb335a75606ce5e5de114edf5 100644 (file)
@@ -214,6 +214,7 @@ extern PyTypeObject _PyExc_MemoryError;
             .kind = 1, \
             .compact = 1, \
             .ascii = (ASCII), \
+            .statically_allocated = 1, \
         }, \
     }
 #define _PyASCIIObject_INIT(LITERAL) \
index 7861fa26c8007e76a0c42527853e398cc06f6d87..78ed4bbaad4eb67a95cfc522b9d0ef92a7615ff6 100644 (file)
@@ -14,6 +14,7 @@ from test.support import os_helper
 from test.support.script_helper import assert_python_ok, assert_python_failure
 from test.support import threading_helper
 from test.support import import_helper
+from test.support import interpreters
 import textwrap
 import unittest
 import warnings
@@ -699,6 +700,35 @@ class SysModuleTest(unittest.TestCase):
 
         self.assertRaises(TypeError, sys.intern, S("abc"))
 
+    def test_subinterp_intern_dynamically_allocated(self):
+        global INTERN_NUMRUNS
+        INTERN_NUMRUNS += 1
+        s = "never interned before" + str(INTERN_NUMRUNS)
+        t = sys.intern(s)
+        self.assertIs(t, s)
+
+        interp = interpreters.create()
+        interp.run(textwrap.dedent(f'''
+            import sys
+            t = sys.intern({s!r})
+            assert id(t) != {id(s)}, (id(t), {id(s)})
+            assert id(t) != {id(t)}, (id(t), {id(t)})
+            '''))
+
+    def test_subinterp_intern_statically_allocated(self):
+        # See Tools/build/generate_global_objects.py for the list
+        # of strings that are always statically allocated.
+        s = '__init__'
+        t = sys.intern(s)
+
+        print('------------------------')
+        interp = interpreters.create()
+        interp.run(textwrap.dedent(f'''
+            import sys
+            t = sys.intern({s!r})
+            assert id(t) == {id(t)}, (id(t), {id(t)})
+            '''))
+
     def test_sys_flags(self):
         self.assertTrue(sys.flags)
         attrs = ("debug",
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-07-25-15-29-26.gh-issue-106931.kKU1le.rst b/Misc/NEWS.d/next/Core and Builtins/2023-07-25-15-29-26.gh-issue-106931.kKU1le.rst
new file mode 100644 (file)
index 0000000..e0def53
--- /dev/null
@@ -0,0 +1,3 @@
+Statically allocated string objects are now interned globally instead of
+per-interpreter.  This fixes a situation where such a string would only be
+interned in a single interpreter. Normal string objects are unaffected.
index fe2660c6ce60584deffc5842946cc4e4732c1886..cc979b2ef28d76e13451110dc43c3c46da3de53c 100644 (file)
@@ -236,15 +236,54 @@ static inline PyObject *get_interned_dict(PyInterpreterState *interp)
     return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
 }
 
+#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
+
 Py_ssize_t
 _PyUnicode_InternedSize(void)
 {
-    return PyObject_Length(get_interned_dict(_PyInterpreterState_GET()));
+    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
+    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
+}
+
+static Py_hash_t unicode_hash(PyObject *);
+static int unicode_compare_eq(PyObject *, PyObject *);
+
+static Py_uhash_t
+hashtable_unicode_hash(const void *key)
+{
+    return unicode_hash((PyObject *)key);
+}
+
+static int
+hashtable_unicode_compare(const void *key1, const void *key2)
+{
+    PyObject *obj1 = (PyObject *)key1;
+    PyObject *obj2 = (PyObject *)key2;
+    if (obj1 != NULL && obj2 != NULL) {
+        return unicode_compare_eq(obj1, obj2);
+    }
+    else {
+        return obj1 == obj2;
+    }
 }
 
 static int
 init_interned_dict(PyInterpreterState *interp)
 {
+    if (_Py_IsMainInterpreter(interp)) {
+        assert(INTERNED_STRINGS == NULL);
+        _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
+        INTERNED_STRINGS = _Py_hashtable_new_full(
+            hashtable_unicode_hash,
+            hashtable_unicode_compare,
+            NULL,
+            NULL,
+            &hashtable_alloc
+        );
+        if (INTERNED_STRINGS == NULL) {
+            return -1;
+        }
+    }
     assert(get_interned_dict(interp) == NULL);
     PyObject *interned = interned = PyDict_New();
     if (interned == NULL) {
@@ -263,6 +302,10 @@ clear_interned_dict(PyInterpreterState *interp)
         Py_DECREF(interned);
         _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
     }
+    if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) {
+        _Py_hashtable_destroy(INTERNED_STRINGS);
+        INTERNED_STRINGS = NULL;
+    }
 }
 
 #define _Py_RETURN_UNICODE_EMPTY()   \
@@ -1223,6 +1266,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     _PyUnicode_STATE(unicode).kind = kind;
     _PyUnicode_STATE(unicode).compact = 1;
     _PyUnicode_STATE(unicode).ascii = is_ascii;
+    _PyUnicode_STATE(unicode).statically_allocated = 0;
     if (is_ascii) {
         ((char*)data)[size] = 0;
     }
@@ -1553,7 +1597,9 @@ unicode_dealloc(PyObject *unicode)
      * we accidentally decref an immortal string out of existence. Since
      * the string is an immortal object, just re-set the reference count.
      */
-    if (PyUnicode_CHECK_INTERNED(unicode)) {
+    if (PyUnicode_CHECK_INTERNED(unicode)
+        || _PyUnicode_STATE(unicode).statically_allocated)
+    {
         _Py_SetImmortal(unicode);
         return;
     }
@@ -14503,6 +14549,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
     _PyUnicode_STATE(self).kind = kind;
     _PyUnicode_STATE(self).compact = 0;
     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
+    _PyUnicode_STATE(self).statically_allocated = 0;
     _PyUnicode_UTF8_LENGTH(self) = 0;
     _PyUnicode_UTF8(self) = NULL;
     _PyUnicode_DATA_ANY(self) = NULL;
@@ -14726,6 +14773,23 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
         return;
     }
 
+    /* Look in the global cache first. */
+    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
+    if (r != NULL && r != s) {
+        Py_SETREF(*p, Py_NewRef(r));
+        return;
+    }
+
+    /* Handle statically allocated strings. */
+    if (_PyUnicode_STATE(s).statically_allocated) {
+        assert(_Py_IsImmortal(s));
+        if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) {
+            _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
+        }
+        return;
+    }
+
+    /* Look in the per-interpreter cache. */
     PyObject *interned = get_interned_dict(interp);
     assert(interned != NULL);
 
@@ -14741,9 +14805,11 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
     }
 
     if (_Py_IsImmortal(s)) {
+        // XXX Restrict this to the main interpreter?
         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
-       return;
+        return;
     }
+
 #ifdef Py_REF_DEBUG
     /* The reference count value excluding the 2 references from the
        interned dictionary should be excluded from the RefTotal. The
index 0f07cd20b1a95e61bb9b4a427053b39f977a323d..4e22a1a5509eb50ea6bf2a950e784e8cc8050660 100644 (file)
@@ -129,6 +129,13 @@ _Py_hashtable_size(const _Py_hashtable_t *ht)
 }
 
 
+size_t
+_Py_hashtable_len(const _Py_hashtable_t *ht)
+{
+    return ht->nentries;
+}
+
+
 _Py_hashtable_entry_t *
 _Py_hashtable_get_entry_generic(_Py_hashtable_t *ht, const void *key)
 {
index b084d3e457f7827899bb49b8b7629c114aa0142e..a11fe6a62811ab01e4e09fd25c90d8b0597389d2 100644 (file)
@@ -208,6 +208,7 @@ class Printer:
                         self.write(".kind = 1,")
                         self.write(".compact = 1,")
                         self.write(".ascii = 1,")
+                        self.write(".statically_allocated = 1,")
                 self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
                 return f"& {name}._ascii.ob_base"
             else:
@@ -220,6 +221,7 @@ class Printer:
                             self.write(f".kind = {kind},")
                             self.write(".compact = 1,")
                             self.write(".ascii = 0,")
+                            self.write(".statically_allocated = 1,")
                     utf8 = s.encode('utf-8')
                     self.write(f'.utf8 = {make_string_literal(utf8)},')
                     self.write(f'.utf8_length = {len(utf8)},')