gh-103997: Automatically dedent the argument to "-c" (#103998)

author Jon Crall <erotemic@gmail.com>

Fri, 18 Apr 2025 08:39:30 +0000 (04:39 -0400)

committer GitHub <noreply@github.com>

Fri, 18 Apr 2025 08:39:30 +0000 (17:39 +0900)
author Jon Crall <erotemic@gmail.com>
Fri, 18 Apr 2025 08:39:30 +0000 (04:39 -0400)
committer GitHub <noreply@github.com>
Fri, 18 Apr 2025 08:39:30 +0000 (17:39 +0900)
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst

index 9b5c6eb863e56dec2618eb3970d7db4d29ae8ea6..fa7c9cddf9c6d6d63bb862e2e9c45a8f061ca7b0 100644 (file)
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -73,6 +73,9 @@ source.
  
     .. audit-event:: cpython.run_command command cmdoption-c
  
+   .. versionchanged:: next
+      *command* is automatically dedented before execution.
+
  .. option:: -m <module-name>
  
     Search :data:`sys.path` for the named module and execute its contents as
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst

index 7d469e83dc27ad4da211e5fe6338c8e4f0d93f73..aaa4702d53df93383d308a4d5d612238d0992d84 100644 (file)
--- a/Doc/whatsnew/3.14.rst
+++ b/Doc/whatsnew/3.14.rst
@@ -474,6 +474,12 @@ Other language changes
    explicitly overridden in the subclass.
    (Contributed by Tomasz Pytel in :gh:`132329`.)
  
+* The command line option :option:`-c` now automatically dedents its code
+  argument before execution. The auto-dedentation behavior mirrors
+  :func:`textwrap.dedent`.
+  (Contributed by Jon Crall and Steven Sun in :gh:`103998`.)
+
+
  .. _whatsnew314-pep765:
  
  PEP 765: Disallow return/break/continue that exit a finally block
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h

index 5fea3247e8f68d30201704c05552b1fb3badb876..c85d53b89accdb6318b5b750f26623d47414a52d 100644 (file)
--- a/Include/internal/pycore_unicodeobject.h
+++ b/Include/internal/pycore_unicodeobject.h
@@ -247,6 +247,12 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
      Py_UCS4 *maxchar,
      int forward);
  
+/* Dedent a string.
+   Behaviour is expected to be an exact match of `textwrap.dedent`.
+   Return a new reference on success, NULL with exception set on error.
+   */
+extern PyObject* _PyUnicode_Dedent(PyObject *unicode);
+
  /* --- Misc functions ----------------------------------------------------- */
  
  extern PyObject* _PyUnicode_FormatLong(PyObject *, int, int, int);
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py

index b949b310ac0f5f6f2c901d85df10463448d27327..e1d1d03d4ff698a358f2bfe4127b98e847d1b623 100644 (file)
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -17,6 +17,8 @@ from test.support.script_helper import (
      spawn_python, kill_python, assert_python_ok, assert_python_failure,
      interpreter_requires_environment
  )
+from textwrap import dedent
+
  
  if not support.has_subprocess_support:
      raise unittest.SkipTest("test module requires subprocess")
@@ -1051,6 +1053,88 @@ class CmdLineTest(unittest.TestCase):
          )
          self.assertEqual(res2int(res), (6000, 6000))
  
+    def test_cmd_dedent(self):
+        # test that -c auto-dedents its arguments
+        #
+        # Each test case is a (code, expected stdout) pair.  Every snippet is
+        # run twice: once as-is and once after textwrap.dedent(); both runs
+        # must succeed and must print the same, expected, output.
+        test_cases = [
+            # code indented with spaces only
+            (
+                """
+                    print('space-auto-dedent')
+                """,
+                "space-auto-dedent",
+            ),
+            # code indented with tabs only: the literal is dedented first,
+            # then every '^' placeholder is replaced by one tab
+            (
+                dedent(
+                    """
+                ^^^print('tab-auto-dedent')
+                """
+                ).replace("^", "\t"),
+                "tab-auto-dedent",
+            ),
+            # every '^' placeholder expands to the mixed sequence "\t \t";
+            # all lines share the same mixed prefix, so dedenting still works
+            (
+                dedent(
+                    """
+                ^^if 1:
+                ^^^^print('mixed-auto-dedent-1')
+                ^^print('mixed-auto-dedent-2')
+                """
+                ).replace("^", "\t \t"),
+                "mixed-auto-dedent-1\nmixed-auto-dedent-2",
+            ),
+            # the '$' markers are removed before use; they make the trailing
+            # whitespace on those lines visible in the source
+            (
+                '''
+                    data = """$

+                    this data has an empty newline above and a newline with spaces below $
+                                            $
+                    """$
+                    if 1:         $
+                        print(repr(data))$
+                '''.replace(
+                    "$", ""
+                ),
+                # Note: entirely blank lines are normalized to \n, even if they
+                # are part of a data string. This is consistent with
+                # textwrap.dedent behavior, but might not be intuitive.
+                "'\\n\\nthis data has an empty newline above and a newline with spaces below \\n\\n'",
+            ),
+            # empty command: nothing to dedent, nothing printed
+            (
+                '',
+                '',
+            ),
+            # command made only of spaces, tabs and newlines
+            (
+                '  \t\n\t\n \t\t\t  \t\t \t\n\t\t \n\n\n\t\t\t   ',
+                '',
+            ),
+        ]
+        for code, expected in test_cases:
+            # Run the auto-dedent case
+            args1 = sys.executable, '-c', code
+            proc1 = subprocess.run(args1, stdout=subprocess.PIPE)
+            self.assertEqual(proc1.returncode, 0, proc1)
+            output1 = proc1.stdout.strip().decode(encoding='utf-8')
+
+            # Manually dedent beforehand, check the result is the same.
+            args2 = sys.executable, '-c', dedent(code)
+            proc2 = subprocess.run(args2, stdout=subprocess.PIPE)
+            self.assertEqual(proc2.returncode, 0, proc2)
+            output2 = proc2.stdout.strip().decode(encoding='utf-8')
+
+            self.assertEqual(output1, output2)
+            # '\r\n' is normalized to '\n' before comparing with `expected`
+            self.assertEqual(output1.replace('\r\n', '\n'), expected)
+
+    def test_cmd_dedent_failcase(self):
+        # Mixing tabs and spaces is not allowed.
+        #
+        # In the template, '-' and '+' are placeholders for the two
+        # indentation characters.  The two code lines start with different
+        # placeholders, so they have no common leading whitespace and the
+        # command is expected to fail.  The template is checked twice, with
+        # the roles of space and tab swapped.
+        # (`dedent` is the module-level import; no local import is needed.)
+        template = dedent(
+            '''
+            -+if 1:
+            +-++ print('will fail')
+            ''')
+        # '-' -> space, '+' -> tab
+        code = template.replace('-', ' ').replace('+', '\t')
+        assert_python_failure('-c', code)
+        # '-' -> tab, '+' -> space
+        code = template.replace('-', '\t').replace('+', ' ')
+        assert_python_failure('-c', code)
+
      def test_cpu_count(self):
          code = "import os; print(os.cpu_count(), os.process_cpu_count())"
          res = assert_python_ok('-X', 'cpu_count=4321', '-c', code)
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst b/Misc/NEWS.d/next/Core_and_Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst

new file mode 100644 (file)

index 0000000..511ca8f
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst
@@ -0,0 +1,4 @@
+String arguments passed to "-c" are now automatically dedented as if by
+:func:`textwrap.dedent`. This allows "python -c" invocations to be indented
+in shell scripts without causing indentation errors. (Patch by Jon Crall and
+Steven Sun)
diff --git a/Modules/main.c b/Modules/main.c

index c2b7bfde2abd7c2c7949eadf30caf25bf3dfba7f..ea1239ecc57f00eae3150a9309d584bbf00b5400 100644 (file)
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -11,6 +11,7 @@
  #include "pycore_pylifecycle.h"   // _Py_PreInitializeFromPyArgv()
  #include "pycore_pystate.h"       // _PyInterpreterState_GET()
  #include "pycore_pythonrun.h"     // _PyRun_AnyFileObject()
+#include "pycore_unicodeobject.h" // _PyUnicode_Dedent()
  
  /* Includes for exit_sigint() */
  #include <stdio.h>                // perror()
@@ -244,6 +245,11 @@ pymain_run_command(wchar_t *command)
          return pymain_exit_err_print();
      }
  
+    Py_SETREF(unicode, _PyUnicode_Dedent(unicode));
+    if (unicode == NULL) {
+        goto error;
+    }
+
      bytes = PyUnicode_AsUTF8String(unicode);
      Py_DECREF(unicode);
      if (bytes == NULL) {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 7c735685e89389298a3cd2aed55997a030e67810..e01a10fc19e904a8406d2ceb78b952e0eee99226 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -14270,6 +14270,163 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
      return Py_BuildValue("(N)", copy);
  }
  
+/*
+Search for the longest common leading whitespace (spaces and tabs) shared by
+all lines in [src, end) that contain at least one non-whitespace character.
+Lines that are empty or consist only of spaces and tabs are ignored.
+
+Return the length of the common leading whitespace.  If that length is > 0,
+`*output` is set to point to the beginning of the common leading whitespace
+(inside the [src, end) buffer); otherwise `*output` is left untouched.
+*/
+static Py_ssize_t
+search_longest_common_leading_whitespace(
+    const char *const src,
+    const char *const end,
+    const char **output)
+{
+    // [_start, _start + _len)
+    // describes the current longest common leading whitespace
+    const char *_start = NULL;
+    Py_ssize_t _len = 0;
+
+    // each iteration handles one line; the `++iter` of the `for` statement
+    // steps over the '\n' that terminated the line
+    for (const char *iter = src; iter < end; ++iter) {
+        const char *line_start = iter;
+        // stays NULL until a non-whitespace character is seen on this line
+        const char *leading_whitespace_end = NULL;
+
+        // scan the whole line
+        while (iter < end && *iter != '\n') {
+            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
+                /* `iter` points to the first non-whitespace character
+                   in this line */
+                if (iter == line_start) {
+                    // some line has no indent, fast exit!
+                    return 0;
+                }
+                leading_whitespace_end = iter;
+            }
+            ++iter;
+        }
+
+        // if this line is empty or all whitespace, skip it
+        if (!leading_whitespace_end) {
+            continue;
+        }
+
+        if (!_start) {
+            // first indented line: its whole indent is the initial candidate
+            _start = line_start;
+            _len = leading_whitespace_end - line_start;
+            assert(_len > 0);
+        }
+        else {
+            /* We then compare with the current longest leading whitespace.
+
+               [line_start, leading_whitespace_end) is the leading
+               whitespace of this line,
+
+               [_start, _start + _len) is the leading whitespace of the
+               current longest leading whitespace.
+
+               The comparison is character by character, so a tab and a
+               space never match each other. */
+            Py_ssize_t new_len = 0;
+            const char *_iter = _start, *line_iter = line_start;
+
+            while (_iter < _start + _len && line_iter < leading_whitespace_end
+                   && *_iter == *line_iter)
+            {
+                ++_iter;
+                ++line_iter;
+                ++new_len;
+            }
+
+            // the candidate can only shrink to the matching prefix
+            _len = new_len;
+            if (_len == 0) {
+                // No common things now, fast exit!
+                return 0;
+            }
+        }
+    }
+
+    assert(_len >= 0);
+    if (_len > 0) {
+        *output = _start;
+    }
+    return _len;
+}
+
+/* Dedent a string.
+   Behaviour is expected to be an exact match of `textwrap.dedent`:
+   the longest run of spaces/tabs common to all non-blank lines is removed
+   from every line, and lines consisting only of spaces/tabs are reduced to
+   their line ending (or to nothing for a final line without '\n').
+   If there is nothing to remove, `unicode` itself is returned.
+   Return a new reference on success, NULL with exception set on error.
+   */
+PyObject *
+_PyUnicode_Dedent(PyObject *unicode)
+{
+    Py_ssize_t src_len = 0;
+    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
+    if (!src) {
+        return NULL;
+    }
+    assert(src_len >= 0);
+    if (src_len == 0) {
+        return Py_NewRef(unicode);
+    }
+
+    const char *const end = src + src_len;
+
+    // [whitespace_start, whitespace_start + whitespace_len)
+    // describes the current longest common leading whitespace
+    const char *whitespace_start = NULL;
+    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
+        src, end, &whitespace_start);
+
+    if (whitespace_len == 0) {
+        // no common indentation: the input is returned unchanged
+        return Py_NewRef(unicode);
+    }
+
+    // now we should trigger a dedent
+    // the result is never longer than the input, so `src_len` bytes suffice
+    char *dest = PyMem_Malloc(src_len);
+    if (!dest) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    char *dest_iter = dest;
+
+    // each iteration handles one line; the `++iter` of the `for` statement
+    // steps over the '\n' that terminated the line
+    for (const char *iter = src; iter < end; ++iter) {
+        const char *line_start = iter;
+        bool in_leading_space = true;
+
+        // iterate over a line to find the end of a line
+        while (iter < end && *iter != '\n') {
+            if (in_leading_space && *iter != ' ' && *iter != '\t') {
+                in_leading_space = false;
+            }
+            ++iter;
+        }
+
+        // invariant: *iter == '\n' or iter == end
+        bool append_newline = iter < end;
+
+        /* A line that is empty or all whitespace is reduced to its '\n',
+           if it has one.  This must also cover a final whitespace-only
+           line without a trailing '\n': such a line can be shorter than
+           the common indentation (it was ignored when that indentation
+           was computed), so letting it reach the copy below would give a
+           negative length. */
+        if (in_leading_space) {
+            if (append_newline) {
+                *dest_iter++ = '\n';
+            }
+            continue;
+        }
+
+        /* copy [line_start + whitespace_len, iter) to buffer, then
+           conditionally append '\n' */
+
+        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
+        assert(new_line_len >= 0);
+        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
+
+        dest_iter += new_line_len;
+
+        if (append_newline) {
+            *dest_iter++ = '\n';
+        }
+    }
+
+    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
+    PyMem_Free(dest);
+    return res;
+}
+
  static PyMethodDef unicode_methods[] = {
      UNICODE_ENCODE_METHODDEF
      UNICODE_REPLACE_METHODDEF
author	Jon Crall <erotemic@gmail.com>
	Fri, 18 Apr 2025 08:39:30 +0000 (04:39 -0400)
committer	GitHub <noreply@github.com>
	Fri, 18 Apr 2025 08:39:30 +0000 (17:39 +0900)
Doc/using/cmdline.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.14.rst		patch \| blob \| blame \| history
Include/internal/pycore_unicodeobject.h		patch \| blob \| blame \| history
Lib/test/test_cmd_line.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core_and_Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst	[new file with mode: 0644]	patch \| blob
Modules/main.c		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history