libgfortran/m4/matmul.m4

   1 `/* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2024 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>'
  29
  30 include(iparm.m4)dnl
  31
  32 `#if defined (HAVE_'rtype_name`)
  33
  34 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  35    passed to us by the front-end, in which case we call it for large
  36    matrices.  */
  37
  38 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  39                           const int *, const 'rtype_name` *, const 'rtype_name` *,
  40                           const int *, const 'rtype_name` *, const int *,
  41                           const 'rtype_name` *, 'rtype_name` *, const int *,
  42                           int, int);
  43
  44 /* The order of loops is different in the case of plain matrix
  45    multiplication C=MATMUL(A,B), and in the frequent special case where
  46    the argument A is the temporary result of a TRANSPOSE intrinsic:
  47    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  48    looking at their strides.
  49
  50    The equivalent Fortran pseudo-code is:
  51
  52    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  53    IF (.NOT.IS_TRANSPOSED(A)) THEN
  54      C = 0
  55      DO J=1,N
  56        DO K=1,COUNT
  57          DO I=1,M
  58            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  59    ELSE
  60      DO J=1,N
  61        DO I=1,M
  62          S = 0
  63          DO K=1,COUNT
  64            S = S+A(I,K)*B(K,J)
  65          C(I,J) = S
  66    ENDIF
  67 */
  68
  69 /* If try_blas is set to a nonzero value, then the matmul function will
  70    see if there is a way to perform the matrix multiplication by a call
  71    to the BLAS gemm function.  */
  72
  73 extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
  74         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  75         int blas_limit, blas_call gemm);
  76 export_proto(matmul_'rtype_code`);
  77
  78 /* Put exhaustive list of possible architectures here here, ORed together.  */
  79
  80 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  81
  82 #ifdef HAVE_AVX
  83 'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
  84 `static void
  85 'matmul_name` ('rtype` * const restrict retarray,
  86         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  87         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  88 static' include(matmul_internal.m4)dnl
  89 `#endif /* HAVE_AVX */
  90
  91 #ifdef HAVE_AVX2
  92 'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
  93 `static void
  94 'matmul_name` ('rtype` * const restrict retarray,
  95         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  96         int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
  97 static' include(matmul_internal.m4)dnl
  98 `#endif /* HAVE_AVX2 */
  99
 100 #ifdef HAVE_AVX512F
 101 'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
 102 `static void
 103 'matmul_name` ('rtype` * const restrict retarray,
 104         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 105         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
 106 static' include(matmul_internal.m4)dnl
 107 `#endif  /* HAVE_AVX512F */
 108
 109 /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
 110
 111 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
 112 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
 113 `void
 114 'matmul_name` ('rtype` * const restrict retarray,
 115         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 116         int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
 117 internal_proto('matmul_name`);
 118 #endif
 119
 120 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
 121 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
 122 `void
 123 'matmul_name` ('rtype` * const restrict retarray,
 124         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 125         int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
 126 internal_proto('matmul_name`);
 127 #endif
 128
 129 /* Function to fall back to if there is no special processor-specific version.  */
 130 'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
 131 `static' include(matmul_internal.m4)dnl
 132
 133 `/* Compiling main function, with selection code for the processor.  */
 134
 135 /* Currently, this is i386 only.  Adjust for other architectures.  */
 136
 137 void matmul_'rtype_code` ('rtype` * const restrict retarray,
 138         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 139         int blas_limit, blas_call gemm)
 140 {
 141   static void (*matmul_p) ('rtype` * const restrict retarray,
 142         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 143         int blas_limit, blas_call gemm);
 144
 145   void (*matmul_fn) ('rtype` * const restrict retarray,
 146         'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 147         int blas_limit, blas_call gemm);
 148
 149   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
 150   if (matmul_fn == NULL)
 151     {
 152       matmul_fn = matmul_'rtype_code`_vanilla;
 153       if (__builtin_cpu_is ("intel"))
 154         {
 155           /* Run down the available processors in order of preference.  */
 156 #ifdef HAVE_AVX512F
 157           if (__builtin_cpu_supports ("avx512f"))
 158             {
 159               matmul_fn = matmul_'rtype_code`_avx512f;
 160               goto store;
 161             }
 162
 163 #endif  /* HAVE_AVX512F */
 164
 165 #ifdef HAVE_AVX2
 166           if (__builtin_cpu_supports ("avx2")
 167               && __builtin_cpu_supports ("fma"))
 168             {
 169               matmul_fn = matmul_'rtype_code`_avx2;
 170               goto store;
 171             }
 172
 173 #endif
 174
 175 #ifdef HAVE_AVX
 176           if (__builtin_cpu_supports ("avx"))
 177             {
 178               matmul_fn = matmul_'rtype_code`_avx;
 179               goto store;
 180             }
 181 #endif  /* HAVE_AVX */
 182         }
 183     else if (__builtin_cpu_is ("amd"))
 184       {
 185 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
 186         if (__builtin_cpu_supports ("avx")
 187             && __builtin_cpu_supports ("fma"))
 188           {
 189             matmul_fn = matmul_'rtype_code`_avx128_fma3;
 190             goto store;
 191           }
 192 #endif
 193 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
 194         if (__builtin_cpu_supports ("avx")
 195             && __builtin_cpu_supports ("fma4"))
 196           {
 197             matmul_fn = matmul_'rtype_code`_avx128_fma4;
 198             goto store;
 199           }
 200 #endif
 201
 202       }
 203    store:
 204       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
 205    }
 206
 207    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
 208 }
 209
 210 #else  /* Just the vanilla function.  */
 211
 212 'define(`matmul_name',`matmul_'rtype_code)dnl
 213 define(`target_attribute',`')dnl
 214 include(matmul_internal.m4)dnl
 215 `#endif
 216 #endif
 217 '