tree-parloops: Enable runtime thread detection with -ftree-parallelize-loops

author Sebastian Pop <spop@nvidia.com>

Fri, 25 Jul 2025 15:55:03 +0000 (17:55 +0200)

committer Richard Biener <rguenth@gcc.gnu.org>

Wed, 15 Oct 2025 12:57:45 +0000 (14:57 +0200)
author Sebastian Pop <spop@nvidia.com>
Fri, 25 Jul 2025 15:55:03 +0000 (17:55 +0200)
committer Richard Biener <rguenth@gcc.gnu.org>
Wed, 15 Oct 2025 12:57:45 +0000 (14:57 +0200)
diff --git a/gcc/common.opt b/gcc/common.opt

index 6c993a8a6d384fe5bedbf54821a5324927a933ff..9b8fbf6a684538f15357aeb8e375aac57523ebe0 100644 (file)
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3303,6 +3303,10 @@ ftree-parallelize-loops=
  Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
  -ftree-parallelize-loops=<number>      Enable automatic parallelization of loops.
  
+ftree-parallelize-loops
+Common Alias(ftree-parallelize-loops=,2147483647,1)
+Enable automatic parallelization of loops.
+
  ftree-phiprop
  Common Var(flag_tree_phiprop) Init(1) Optimization
  Enable hoisting loads from conditional pointers.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

index ddbcbf0dddeb78bac12ba3f73101a4a70e442f2c..3f5398646bf3165393fafd974511f58593b13654 100644 (file)
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -659,7 +659,7 @@ Objective-C and Objective-C++ Dialects}.
  -ftree-phiprop  -ftree-loop-distribution  -ftree-loop-distribute-patterns
  -ftree-loop-ivcanon  -ftree-loop-linear  -ftree-loop-optimize
  -ftree-loop-vectorize
--ftree-parallelize-loops=@var{n}  -ftree-pre  -ftree-partial-pre  -ftree-pta
+-ftree-parallelize-loops[=@var{n}]  -ftree-pre  -ftree-partial-pre  -ftree-pta
  -ftree-reassoc  -ftree-scev-cprop  -ftree-sink  -ftree-slsr  -ftree-sra
  -ftree-switch-conversion  -ftree-tail-merge
  -ftree-ter  -ftree-vectorize  -ftree-vrp  -ftrivial-auto-var-init
@@ -14691,8 +14691,9 @@ variable merging and induction variable elimination) on trees.
  Enabled by default at @option{-O1} and higher.
  
  @opindex ftree-parallelize-loops
-@item -ftree-parallelize-loops=n
-Parallelize loops, i.e., split their iteration space to run in n threads.
+@item -ftree-parallelize-loops
+@itemx -ftree-parallelize-loops=@var{n}
+Parallelize loops, i.e., split their iteration space to run in multiple threads.
  This is only possible for loops whose iterations are independent
  and can be arbitrarily reordered.  The optimization is only
  profitable on multiprocessor machines, for loops that are CPU-intensive,
@@ -14700,6 +14701,17 @@ rather than constrained e.g.@: by memory bandwidth.  This option
  implies @option{-pthread}, and thus is only supported on targets
  that have support for @option{-pthread}.
  
+When a positive value @var{n} is specified, the number of threads is fixed
+at compile time and cannot be changed after compilation.  The compiler
+generates @code{#pragma omp parallel num_threads(@var{n})}.
+
+When used without @code{=@var{n}} (i.e., @option{-ftree-parallelize-loops}),
+the number of threads is determined at program execution time via the
+@env{OMP_NUM_THREADS} environment variable.  If @env{OMP_NUM_THREADS} is not
+set, the OpenMP runtime automatically detects the number of available
+processors and uses that value.  This enables creating binaries that
+adapt to different hardware configurations without recompilation.
+
  @opindex ftree-pta
  @item -ftree-pta
  Perform function-local points-to analysis on trees.  This flag is
diff --git a/gcc/testsuite/gcc.dg/autopar/runtime-auto.c b/gcc/testsuite/gcc.dg/autopar/runtime-auto.c

new file mode 100644 (file)

index 0000000..c1a3131
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/autopar/runtime-auto.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-parallelize-loops -fdump-tree-parloops2-details" } */
+
+void abort (void);
+
+#define N 1000
+
+int a[N], b[N], c[N];
+
+void
+test_parallel_loop (void)
+{
+  int i;
+
+  /* This loop should be auto-parallelized when -ftree-parallelize-loops
+     (without =number) is used for runtime thread detection via OMP_NUM_THREADS.  */
+  for (i = 0; i < N; i++)
+    a[i] = b[i] + c[i];
+}
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      b[i] = i;
+      c[i] = i * 2;
+    }
+
+  test_parallel_loop ();
+
+  for (i = 0; i < N; i++)
+    {
+      if (a[i] != b[i] + c[i])
+       abort ();
+    }
+
+  return 0;
+}
+
+/* Check that the loop is parallelized with runtime thread detection.  */
+/* { dg-final { scan-tree-dump "parallelizing" "parloops2" } } */
+
+/* Check that "#pragma omp parallel" is generated.  */
+/* { dg-final { scan-tree-dump "pragma omp parallel" "parloops2" } } */
+
+/* Check that instead of generating a num_threads(x) clause, the compiler calls
+   "__builtin_omp_get_num_threads", so that the number of threads is obtained
+   from the OpenMP runtime at program execution time.  */
+/* { dg-final { scan-tree-dump "__builtin_omp_get_num_threads" "parloops2" } } */
+
diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc

index 666c6a1f376095b76814c465631f13fa9627f432..736182868dc48b6243eb3dcd768f7f69e175065f 100644 (file)
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2601,10 +2601,19 @@ create_parallel_loop (class loop *loop, tree loop_fn, tree data,
        gsi = gsi_last_bb (paral_bb);
  
        gcc_checking_assert (n_threads != 0);
-      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
-      OMP_CLAUSE_NUM_THREADS_EXPR (t)
-       = build_int_cst (integer_type_node, n_threads);
-      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+      if (n_threads == INT_MAX)
+       /* No hardcoded thread count, let OpenMP runtime decide.  */
+       omp_par_stmt = gimple_build_omp_parallel (NULL, NULL_TREE, loop_fn,
+                                                 data);
+      else
+       {
+         /* Build the OMP_CLAUSE_NUM_THREADS clause only if we have a fixed
+            thread count.  */
+         t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+         OMP_CLAUSE_NUM_THREADS_EXPR (t)
+           = build_int_cst (integer_type_node, n_threads);
+         omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+       }
        gimple_set_location (omp_par_stmt, loc);
  
        gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
@@ -2812,7 +2821,6 @@ gen_parallel_loop (class loop *loop,
    struct clsn_data clsn_data;
    location_t loc;
    gimple *cond_stmt;
-  unsigned int m_p_thread=2;
  
    /* From
  
@@ -2885,15 +2893,14 @@ gen_parallel_loop (class loop *loop,
  
    if (!oacc_kernels_p)
      {
-      if (loop->inner)
-       m_p_thread=2;
-      else
-       m_p_thread=MIN_PER_THREAD;
-
        gcc_checking_assert (n_threads != 0);
+      /* For runtime thread detection, use a conservative estimate of 2 threads
+        for the many iterations condition check.  */
+      unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
+      unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
        many_iterations_cond =
         fold_build2 (GE_EXPR, boolean_type_node,
-                    nit, build_int_cst (type, m_p_thread * n_threads - 1));
+                    nit, build_int_cst (type, m_p_thread * threads - 1));
  
        many_iterations_cond
         = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
@@ -3905,14 +3912,15 @@ parallelize_loops (bool oacc_kernels_p)
        estimated = estimated_loop_iterations_int (loop);
        if (estimated == -1)
         estimated = get_likely_max_loop_iterations_int (loop);
+      /* For runtime thread detection, use an estimate of 2 threads.  */
+      unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
+      unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
        /* FIXME: Bypass this check as graphite doesn't update the
          count and frequency correctly now.  */
        if (!flag_loop_parallelize_all
           && !oacc_kernels_p
           && ((estimated != -1
-              && (estimated
-                  < ((HOST_WIDE_INT) n_threads
-                     * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
+              && (estimated < ((HOST_WIDE_INT) threads * m_p_thread - 1)))
               /* Do not bother with loops in cold areas.  */
               || optimize_loop_nest_for_size_p (loop)))
         continue;
author	Sebastian Pop <spop@nvidia.com>
	Fri, 25 Jul 2025 15:55:03 +0000 (17:55 +0200)
committer	Richard Biener <rguenth@gcc.gnu.org>
	Wed, 15 Oct 2025 12:57:45 +0000 (14:57 +0200)
gcc/common.opt		patch \| blob \| blame \| history
gcc/doc/invoke.texi		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/autopar/runtime-auto.c	[new file with mode: 0644]	patch \| blob
gcc/tree-parloops.cc		patch \| blob \| blame \| history