gh-81283: compiler: remove indent from docstring (#106411)

author Inada Naoki <songofacandy@gmail.com>

Sat, 15 Jul 2023 10:33:32 +0000 (19:33 +0900)

committer GitHub <noreply@github.com>

Sat, 15 Jul 2023 10:33:32 +0000 (19:33 +0900)
author Inada Naoki <songofacandy@gmail.com>
Sat, 15 Jul 2023 10:33:32 +0000 (19:33 +0900)
committer GitHub <noreply@github.com>
Sat, 15 Jul 2023 10:33:32 +0000 (19:33 +0900)
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst

index 06fcaf4608cdcb54328fbaea27c5c0cb37f28c3c..161d5fb1c59a303293c88bb6cfdd9d1856963a30 100644 (file)
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -79,6 +79,13 @@ Other Language Changes
  * Allow the *count* argument of :meth:`str.replace` to be a keyword.
    (Contributed by Hugo van Kemenade in :gh:`106487`.)
  
+* The compiler now strips indents from docstrings.
+  This will reduce the size of the :term:`bytecode cache <bytecode>` (e.g. ``.pyc`` files).
+  For example, cache file size for ``sqlalchemy.orm.session`` in SQLAlchemy 2.0
+  is reduced by about 5%.
+  This change will affect tools using docstrings, like :mod:`doctest`.
+  (Contributed by Inada Naoki in :gh:`81283`.)
+
  New Modules
  ===========
  
diff --git a/Include/internal/pycore_compile.h b/Include/internal/pycore_compile.h

index e204d4d2457a163bc40974b56c7ea6fe5f58f9cf..beb37cced06dbae78235937b727c58399778eb13 100644 (file)
--- a/Include/internal/pycore_compile.h
+++ b/Include/internal/pycore_compile.h
@@ -91,6 +91,8 @@ int _PyCompile_ConstCacheMergeOne(PyObject *const_cache, PyObject **obj);
  
  /* Access compiler internals for unit testing */
  
+PyAPI_FUNC(PyObject*) _PyCompile_CleanDoc(PyObject *doc);
+
  PyAPI_FUNC(PyObject*) _PyCompile_CodeGen(
          PyObject *ast,
          PyObject *filename,
diff --git a/Lib/inspect.py b/Lib/inspect.py

index a550202bb0d49bf539d649ca865820348dafb032..15f94a194856aca521df356ee5bcb7f0fb5ed47b 100644 (file)
--- a/Lib/inspect.py
+++ b/Lib/inspect.py
@@ -881,29 +881,28 @@ def cleandoc(doc):
  
      Any whitespace that can be uniformly removed from the second line
      onwards is removed."""
-    try:
-        lines = doc.expandtabs().split('\n')
-    except UnicodeError:
-        return None
-    else:
-        # Find minimum indentation of any non-blank lines after first line.
-        margin = sys.maxsize
-        for line in lines[1:]:
-            content = len(line.lstrip())
-            if content:
-                indent = len(line) - content
-                margin = min(margin, indent)
-        # Remove indentation.
-        if lines:
-            lines[0] = lines[0].lstrip()
-        if margin < sys.maxsize:
-            for i in range(1, len(lines)): lines[i] = lines[i][margin:]
-        # Remove any trailing or leading blank lines.
-        while lines and not lines[-1]:
-            lines.pop()
-        while lines and not lines[0]:
-            lines.pop(0)
-        return '\n'.join(lines)
+    lines = doc.expandtabs().split('\n')
+
+    # Find minimum indentation of any non-blank lines after first line.
+    margin = sys.maxsize
+    for line in lines[1:]:
+        content = len(line.lstrip(' '))
+        if content:
+            indent = len(line) - content
+            margin = min(margin, indent)
+    # Remove indentation.
+    if lines:
+        lines[0] = lines[0].lstrip(' ')
+    if margin < sys.maxsize:
+        for i in range(1, len(lines)):
+            lines[i] = lines[i][margin:]
+    # Remove any trailing or leading blank lines.
+    while lines and not lines[-1]:
+        lines.pop()
+    while lines and not lines[0]:
+        lines.pop(0)
+    return '\n'.join(lines)
+
  
  def getfile(object):
      """Work out which source or compiled file an object was defined in."""
diff --git a/Lib/test/test_doctest.py b/Lib/test/test_doctest.py

index 542fcdb5cf6f663267eaa22f85dee2ef13eb7e28..bea52c6de7ec6de528d175846004723c919addde 100644 (file)
--- a/Lib/test/test_doctest.py
+++ b/Lib/test/test_doctest.py
@@ -1287,14 +1287,14 @@ The NORMALIZE_WHITESPACE flag causes all sequences of whitespace to be
  treated as equal:
  
      >>> def f(x):
-    ...     '>>> print(1, 2, 3)\n  1   2\n 3'
+    ...     '\n>>> print(1, 2, 3)\n  1   2\n 3'
  
      >>> # Without the flag:
      >>> test = doctest.DocTestFinder().find(f)[0]
      >>> doctest.DocTestRunner(verbose=False).run(test)
      ... # doctest: +ELLIPSIS
      **********************************************************************
-    File ..., line 2, in f
+    File ..., line 3, in f
      Failed example:
          print(1, 2, 3)
      Expected:
diff --git a/Lib/test/test_inspect.py b/Lib/test/test_inspect.py

index d89953ab60f02205ffc5d498fcafeb0b8ab0aa38..64afeec351b35326ee476bbdbfd23348e2952623 100644 (file)
--- a/Lib/test/test_inspect.py
+++ b/Lib/test/test_inspect.py
@@ -596,9 +596,40 @@ class TestRetrievingSourceCode(GetSourceBase):
          self.assertEqual(finddoc(int.from_bytes), int.from_bytes.__doc__)
          self.assertEqual(finddoc(int.real), int.real.__doc__)
  
+    cleandoc_testdata = [
+        # first line should have different margin
+        (' An\n  indented\n   docstring.', 'An\nindented\n docstring.'),
+        # trailing whitespace is not removed.
+        (' An \n   \n  indented \n   docstring. ',
+         'An \n \nindented \n docstring. '),
+        # NUL is not termination.
+        ('doc\0string\n\n  second\0line\n  third\0line\0',
+         'doc\0string\n\nsecond\0line\nthird\0line\0'),
+        # first line is lstrip()-ped. other lines are kept when no margin.
+        ('   ', ''),
+        # compiler.cleandoc() doesn't strip leading/trailing newlines
+        # to keep maximum backward compatibility.
+        # inspect.cleandoc() removes them.
+        ('\n\n\n  first paragraph\n\n   second paragraph\n\n',
+         '\n\n\nfirst paragraph\n\n second paragraph\n\n'),
+        ('   \n \n  \n   ', '\n \n  \n   '),
+    ]
+
      def test_cleandoc(self):
-        self.assertEqual(inspect.cleandoc('An\n    indented\n    docstring.'),
-                         'An\nindented\ndocstring.')
+        func = inspect.cleandoc
+        for i, (input, expected) in enumerate(self.cleandoc_testdata):
+            # only inspect.cleandoc() strips \n
+            expected = expected.strip('\n')
+            with self.subTest(i=i):
+                self.assertEqual(func(input), expected)
+
+    @cpython_only
+    def test_c_cleandoc(self):
+        import _testinternalcapi
+        func = _testinternalcapi.compiler_cleandoc
+        for i, (input, expected) in enumerate(self.cleandoc_testdata):
+            with self.subTest(i=i):
+                self.assertEqual(func(input), expected)
  
      def test_getcomments(self):
          self.assertEqual(inspect.getcomments(mod), '# line 1\n')
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst b/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst

new file mode 100644 (file)

index 0000000..f673c66
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst
@@ -0,0 +1,3 @@
+Compiler now strips indents from docstrings. It reduces ``pyc`` file size by 5%
+when the module is heavily documented. This change affects ``__doc__``, so
+tools like doctest will be affected.
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c

index 7745dd5abc22f01ed5ca393aa29647a76c9c3eeb..271ad6cfcaee32327588f8dda7064bd8e4065034 100644 (file)
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -15,7 +15,7 @@
  #include "pycore_atomic_funcs.h" // _Py_atomic_int_get()
  #include "pycore_bitutils.h"     // _Py_bswap32()
  #include "pycore_bytesobject.h"  // _PyBytes_Find()
-#include "pycore_compile.h"      // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble
+#include "pycore_compile.h"      // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble, _PyCompile_CleanDoc
  #include "pycore_ceval.h"        // _PyEval_AddPendingCall
  #include "pycore_fileutils.h"    // _Py_normpath
  #include "pycore_frame.h"        // _PyInterpreterFrame
@@ -704,6 +704,23 @@ set_eval_frame_record(PyObject *self, PyObject *list)
      Py_RETURN_NONE;
  }
  
+/*[clinic input]
+
+_testinternalcapi.compiler_cleandoc -> object
+
+    doc: unicode
+
+C implementation of inspect.cleandoc().
+[clinic start generated code]*/
+
+static PyObject *
+_testinternalcapi_compiler_cleandoc_impl(PyObject *module, PyObject *doc)
+/*[clinic end generated code: output=2dd203a80feff5bc input=2de03fab931d9cdc]*/
+{
+    return _PyCompile_CleanDoc(doc);
+}
+
+
  /*[clinic input]
  
  _testinternalcapi.compiler_codegen -> object
@@ -1448,6 +1465,7 @@ static PyMethodDef module_functions[] = {
      {"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
      {"set_eval_frame_default", set_eval_frame_default, METH_NOARGS, NULL},
      {"set_eval_frame_record", set_eval_frame_record, METH_O, NULL},
+    _TESTINTERNALCAPI_COMPILER_CLEANDOC_METHODDEF
      _TESTINTERNALCAPI_COMPILER_CODEGEN_METHODDEF
      _TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
      _TESTINTERNALCAPI_ASSEMBLE_CODE_OBJECT_METHODDEF
diff --git a/Modules/clinic/_testinternalcapi.c.h b/Modules/clinic/_testinternalcapi.c.h

index f5124125874503e7f5cd34227ffdf1b29c9894aa..9419dcd751a0e9b0a570f5a93dc2f22a8879d565 100644 (file)
--- a/Modules/clinic/_testinternalcapi.c.h
+++ b/Modules/clinic/_testinternalcapi.c.h
@@ -8,6 +8,65 @@ preserve
  #endif
  
  
+PyDoc_STRVAR(_testinternalcapi_compiler_cleandoc__doc__,
+"compiler_cleandoc($module, /, doc)\n"
+"--\n"
+"\n"
+"C implementation of inspect.cleandoc().");
+
+#define _TESTINTERNALCAPI_COMPILER_CLEANDOC_METHODDEF    \
+    {"compiler_cleandoc", _PyCFunction_CAST(_testinternalcapi_compiler_cleandoc), METH_FASTCALL|METH_KEYWORDS, _testinternalcapi_compiler_cleandoc__doc__},
+
+static PyObject *
+_testinternalcapi_compiler_cleandoc_impl(PyObject *module, PyObject *doc);
+
+static PyObject *
+_testinternalcapi_compiler_cleandoc(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 1
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_item = { &_Py_ID(doc), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"doc", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "compiler_cleandoc",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[1];
+    PyObject *doc;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(args[0])) {
+        _PyArg_BadArgument("compiler_cleandoc", "argument 'doc'", "str", args[0]);
+        goto exit;
+    }
+    doc = args[0];
+    return_value = _testinternalcapi_compiler_cleandoc_impl(module, doc);
+
+exit:
+    return return_value;
+}
+
  PyDoc_STRVAR(_testinternalcapi_compiler_codegen__doc__,
  "compiler_codegen($module, /, ast, filename, optimize, compile_mode=0)\n"
  "--\n"
@@ -206,4 +265,4 @@ _testinternalcapi_assemble_code_object(PyObject *module, PyObject *const *args,
  exit:
      return return_value;
  }
-/*[clinic end generated code: output=2965f1578b986218 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=811d50772c8f285a input=a9049054013a1b77]*/
diff --git a/Python/compile.c b/Python/compile.c

index 9e86e06777ffa47a18828affd259273731c15b07..b80f7c01bcd90e735b551b6a562bfa470548766f 100644 (file)
--- a/Python/compile.c
+++ b/Python/compile.c
@@ -1704,10 +1704,16 @@ compiler_body(struct compiler *c, location loc, asdl_stmt_seq *stmts)
      if (c->c_optimize < 2) {
          docstring = _PyAST_GetDocString(stmts);
          if (docstring) {
+            PyObject *cleandoc = _PyCompile_CleanDoc(docstring);
+            if (cleandoc == NULL) {
+                return ERROR;
+            }
              i = 1;
              st = (stmt_ty)asdl_seq_GET(stmts, 0);
              assert(st->kind == Expr_kind);
-            VISIT(c, expr, st->v.Expr.value);
+            location loc = LOC(st->v.Expr.value);
+            ADDOP_LOAD_CONST(c, loc, cleandoc);
+            Py_DECREF(cleandoc);
              RETURN_IF_ERROR(compiler_nameop(c, NO_LOCATION, &_Py_ID(__doc__), Store));
          }
      }
@@ -2252,11 +2258,19 @@ compiler_function_body(struct compiler *c, stmt_ty s, int is_async, Py_ssize_t f
      /* if not -OO mode, add docstring */
      if (c->c_optimize < 2) {
          docstring = _PyAST_GetDocString(body);
+        if (docstring) {
+            docstring = _PyCompile_CleanDoc(docstring);
+            if (docstring == NULL) {
+                compiler_exit_scope(c);
+                return ERROR;
+            }
+        }
      }
      if (compiler_add_const(c->c_const_cache, c->u, docstring ? docstring : Py_None) < 0) {
          compiler_exit_scope(c);
          return ERROR;
      }
+    Py_XDECREF(docstring);
  
      c->u->u_metadata.u_argcount = asdl_seq_LEN(args->args);
      c->u->u_metadata.u_posonlyargcount = asdl_seq_LEN(args->posonlyargs);
@@ -7967,6 +7981,89 @@ error:
      return NULL;
  }
  
+// C implementation of inspect.cleandoc()
+//
+// Difference from inspect.cleandoc():
+// - Do not remove leading and trailing blank lines to keep lineno.
+//
+// Tabs are expanded first, then leading spaces are stripped from the first
+// line and the common indentation (margin) of the non-blank following lines
+// is removed from every following line.
+//
+// Returns a new reference to the cleaned string, or NULL with an exception
+// set on failure.
+PyObject *
+_PyCompile_CleanDoc(PyObject *doc)
+{
+    // `doc` now refers to a new object owned by this function.
+    doc = PyObject_CallMethod(doc, "expandtabs", NULL);
+    if (doc == NULL) {
+        return NULL;
+    }
+
+    Py_ssize_t doc_size;
+    const char *doc_utf8 = PyUnicode_AsUTF8AndSize(doc, &doc_size);
+    if (doc_utf8 == NULL) {
+        Py_DECREF(doc);
+        return NULL;
+    }
+    const char *p = doc_utf8;
+    const char *pend = p + doc_size;
+
+    // First pass: find minimum indentation of any non-blank lines
+    // after first line.
+    while (p < pend && *p++ != '\n') {
+    }
+
+    Py_ssize_t margin = PY_SSIZE_T_MAX;
+    while (p < pend) {
+        const char *s = p;
+        // The UTF-8 buffer is NUL-terminated, so this scan stops at pend.
+        while (*p == ' ') p++;
+        if (p < pend && *p != '\n') {
+            margin = Py_MIN(margin, p - s);
+        }
+        while (p < pend && *p++ != '\n') {
+        }
+    }
+    if (margin == PY_SSIZE_T_MAX) {
+        margin = 0;
+    }
+
+    // Second pass: write cleandoc into buff.
+
+    // copy first line without leading spaces.
+    p = doc_utf8;
+    while (*p == ' ') {
+        p++;
+    }
+    if (p == doc_utf8 && margin == 0 ) {
+        // doc is already clean.
+        return doc;
+    }
+
+    // The cleaned text is never longer than the input, so doc_size bytes
+    // are always enough.
+    char *buff = PyMem_Malloc(doc_size);
+    if (buff == NULL) {
+        Py_DECREF(doc);
+        return PyErr_NoMemory();
+    }
+    char *w = buff;
+
+    while (p < pend) {
+        int ch = *w++ = *p++;
+        if (ch == '\n') {
+            break;
+        }
+    }
+
+    // copy subsequent lines without margin.
+    while (p < pend) {
+        for (Py_ssize_t i = 0; i < margin; i++, p++) {
+            if (*p != ' ') {
+                // Only blank lines (or the end of the text) may be
+                // shorter than the margin.
+                assert(*p == '\n' || *p == '\0');
+                break;
+            }
+        }
+        while (p < pend) {
+            int ch = *w++ = *p++;
+            if (ch == '\n') {
+                break;
+            }
+        }
+    }
+
+    Py_DECREF(doc);
+    PyObject *res = PyUnicode_FromStringAndSize(buff, w - buff);
+    // The new string owns a copy of the data; release the scratch buffer.
+    PyMem_Free(buff);
+    return res;
+}
+
+
  PyObject *
  _PyCompile_CodeGen(PyObject *ast, PyObject *filename, PyCompilerFlags *pflags,
                     int optimize, int compile_mode)
author	Inada Naoki <songofacandy@gmail.com>
	Sat, 15 Jul 2023 10:33:32 +0000 (19:33 +0900)
committer	GitHub <noreply@github.com>
	Sat, 15 Jul 2023 10:33:32 +0000 (19:33 +0900)
Doc/whatsnew/3.13.rst		patch \| blob \| blame \| history
Include/internal/pycore_compile.h		patch \| blob \| blame \| history
Lib/inspect.py		patch \| blob \| blame \| history
Lib/test/test_doctest.py		patch \| blob \| blame \| history
Lib/test/test_inspect.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst	[new file with mode: 0644]	patch \| blob
Modules/_testinternalcapi.c		patch \| blob \| blame \| history
Modules/clinic/_testinternalcapi.c.h		patch \| blob \| blame \| history
Python/compile.c		patch \| blob \| blame \| history