]>
Commit | Line | Data |
---|---|---|
4ee9c684 | 1 | `/* Implementation of the MATMUL intrinsic |
fbd26352 | 2 | Copyright (C) 2002-2019 Free Software Foundation, Inc. |
4ee9c684 | 3 | Contributed by Paul Brook <paul@nowt.org> |
4 | ||
553877d9 | 5 | This file is part of the GNU Fortran runtime library (libgfortran). |
4ee9c684 | 6 | |
7 | Libgfortran is free software; you can redistribute it and/or | |
b417ea8c | 8 | modify it under the terms of the GNU General Public |
4ee9c684 | 9 | License as published by the Free Software Foundation; either |
6bc9506f | 10 | version 3 of the License, or (at your option) any later version. |
4ee9c684 | 11 | |
12 | Libgfortran is distributed in the hope that it will be useful, | |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
b417ea8c | 15 | GNU General Public License for more details. |
4ee9c684 | 16 | |
6bc9506f | 17 | Under Section 7 of GPL version 3, you are granted additional |
18 | permissions described in the GCC Runtime Library Exception, version | |
19 | 3.1, as published by the Free Software Foundation. | |
20 | ||
21 | You should have received a copy of the GNU General Public License and | |
22 | a copy of the GCC Runtime Library Exception along with this program; | |
23 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
24 | <http://www.gnu.org/licenses/>. */ | |
4ee9c684 | 25 | |
41f2d5e8 | 26 | #include "libgfortran.h" |
4e867f90 | 27 | #include <string.h> |
41f2d5e8 | 28 | #include <assert.h>' |
29 | ||
cdafa1f6 | 30 | include(iparm.m4)dnl |
4ee9c684 | 31 | |
0a6b5f6b | 32 | `#if defined (HAVE_'rtype_name`) |
920e54ef | 33 | |
4e8e57b0 | 34 | /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be |
a7c1a652 | 35 | passed to us by the front-end, in which case we call it for large |
4e8e57b0 | 36 | matrices. */ |
37 | ||
38 | typedef void (*blas_call)(const char *, const char *, const int *, const int *, | |
0a6b5f6b | 39 | const int *, const 'rtype_name` *, const 'rtype_name` *, |
40 | const int *, const 'rtype_name` *, const int *, | |
41 | const 'rtype_name` *, 'rtype_name` *, const int *, | |
4e8e57b0 | 42 | int, int); |
43 | ||
018ef8b8 | 44 | /* The order of loops is different in the case of plain matrix |
45 | multiplication C=MATMUL(A,B), and in the frequent special case where | |
46 | the argument A is the temporary result of a TRANSPOSE intrinsic: | |
47 | C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by | |
48 | looking at their strides. | |
49 | ||
50 | The equivalent Fortran pseudo-code is: | |
4e867f90 | 51 | |
52 | DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) | |
018ef8b8 | 53 | IF (.NOT.IS_TRANSPOSED(A)) THEN |
54 | C = 0 | |
55 | DO J=1,N | |
56 | DO K=1,COUNT | |
57 | DO I=1,M | |
58 | C(I,J) = C(I,J)+A(I,K)*B(K,J) | |
59 | ELSE | |
60 | DO J=1,N | |
4e867f90 | 61 | DO I=1,M |
018ef8b8 | 62 | S = 0 |
63 | DO K=1,COUNT | |
4e8e57b0 | 64 | S = S+A(I,K)*B(K,J) |
018ef8b8 | 65 | C(I,J) = S |
66 | ENDIF | |
4e867f90 | 67 | */ |
68 | ||
4e8e57b0 | 69 | /* If try_blas is set to a nonzero value, then the matmul function will |
70 | see if there is a way to perform the matrix multiplication by a call | |
71 | to the BLAS gemm function. */ | |
72 | ||
0a6b5f6b | 73 | extern void matmul_'rtype_code` ('rtype` * const restrict retarray, |
74 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
4e8e57b0 | 75 | int blas_limit, blas_call gemm); |
0a6b5f6b | 76 | export_proto(matmul_'rtype_code`); |
7b6cb5bd | 77 | |
25df644f | 78 | /* Put exhaustive list of possible architectures here, ORed together. */ |
c0d02c82 | 79 | |
25df644f | 80 | #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) |
a7c1a652 | 81 | |
25df644f | 82 | #ifdef HAVE_AVX |
83 | 'define(`matmul_name',`matmul_'rtype_code`_avx')dnl | |
84 | `static void | |
85 | 'matmul_name` ('rtype` * const restrict retarray, | |
86 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
87 | int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); | |
88 | static' include(matmul_internal.m4)dnl | |
89 | `#endif /* HAVE_AVX */ | |
90 | ||
91 | #ifdef HAVE_AVX2 | |
92 | 'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl | |
93 | `static void | |
94 | 'matmul_name` ('rtype` * const restrict retarray, | |
95 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
69077976 | 96 | int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); |
25df644f | 97 | static' include(matmul_internal.m4)dnl |
98 | `#endif /* HAVE_AVX2 */ | |
99 | ||
100 | #ifdef HAVE_AVX512F | |
101 | 'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl | |
102 | `static void | |
103 | 'matmul_name` ('rtype` * const restrict retarray, | |
104 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
105 | int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); | |
106 | static' include(matmul_internal.m4)dnl | |
107 | `#endif /* HAVE_AVX512F */ | |
4ee9c684 | 108 | |
b4e409cb | 109 | /* AMD-specific functions with AVX128 and FMA3/FMA4. */ |
110 | ||
111 | #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) | |
112 | 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl | |
113 | `void | |
114 | 'matmul_name` ('rtype` * const restrict retarray, | |
115 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
116 | int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); | |
117 | internal_proto('matmul_name`); | |
118 | #endif | |
119 | ||
120 | #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) | |
121 | 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl | |
122 | `void | |
123 | 'matmul_name` ('rtype` * const restrict retarray, | |
124 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
125 | int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); | |
126 | internal_proto('matmul_name`); | |
127 | #endif | |
128 | ||
25df644f | 129 | /* Function to fall back to if there is no special processor-specific version. */ |
130 | 'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl | |
131 | `static' include(matmul_internal.m4)dnl | |
4e867f90 | 132 | |
25df644f | 133 | `/* Compiling main function, with selection code for the processor. The implementation is picked on the first call and cached in a static function pointer. */ |
4e867f90 | 134 | |
25df644f | 135 | /* Currently, this is i386 only. Adjust for other architectures. */ |
4e867f90 | 136 | |
25df644f | 137 | #include <config/i386/cpuinfo.h> |
138 | void matmul_'rtype_code` ('rtype` * const restrict retarray, | |
139 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
140 | int blas_limit, blas_call gemm) | |
141 | { /* matmul_p keeps the selected implementation across calls; matmul_fn is the per-call working copy. */ | |
142 | static void (*matmul_p) ('rtype` * const restrict retarray, | |
143 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
89594431 | 144 | int blas_limit, blas_call gemm); |
145 | ||
146 | void (*matmul_fn) ('rtype` * const restrict retarray, | |
147 | 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, | |
148 | int blas_limit, blas_call gemm); | |
4e867f90 | 149 | |
89594431 | 150 | matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); |
151 | if (matmul_fn == NULL) /* Nothing cached yet: select an implementation now. */ | |
a7c1a652 | 152 | { |
89594431 | 153 | matmul_fn = matmul_'rtype_code`_vanilla; /* Default when no specialised version is selected below. */ |
25df644f | 154 | if (__cpu_model.__cpu_vendor == VENDOR_INTEL) |
4e867f90 | 155 | { |
25df644f | 156 | /* Run down the available processors in order of preference: AVX512F, then AVX2 together with FMA, then AVX. Each candidate is only compiled in when its HAVE_ macro is defined. */ |
157 | #ifdef HAVE_AVX512F | |
158 | if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) | |
4e867f90 | 159 | { |
89594431 | 160 | matmul_fn = matmul_'rtype_code`_avx512f; |
161 | goto store; | |
4e867f90 | 162 | } |
018ef8b8 | 163 | |
25df644f | 164 | #endif /* HAVE_AVX512F */ |
dd5b9961 | 165 | |
25df644f | 166 | #ifdef HAVE_AVX2 |
69077976 | 167 | if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) |
168 | && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) | |
018ef8b8 | 169 | { |
89594431 | 170 | matmul_fn = matmul_'rtype_code`_avx2; |
171 | goto store; | |
018ef8b8 | 172 | } |
609b676a | 173 | |
25df644f | 174 | #endif |
018ef8b8 | 175 | |
25df644f | 176 | #ifdef HAVE_AVX |
177 | if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) | |
178 | { | |
89594431 | 179 | matmul_fn = matmul_'rtype_code`_avx; |
180 | goto store; | |
018ef8b8 | 181 | } |
25df644f | 182 | #endif /* HAVE_AVX */ |
183 | } | |
b4e409cb | 184 | else if (__cpu_model.__cpu_vendor == VENDOR_AMD) /* AMD: the AVX128 FMA3 version is tried first, then the FMA4 one. */ |
185 | { | |
186 | #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) | |
187 | if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) | |
188 | && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) | |
189 | { | |
190 | matmul_fn = matmul_'rtype_code`_avx128_fma3; | |
191 | goto store; | |
192 | } | |
193 | #endif | |
194 | #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) | |
195 | if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) | |
196 | && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4))) | |
197 | { | |
198 | matmul_fn = matmul_'rtype_code`_avx128_fma4; | |
199 | goto store; | |
200 | } | |
201 | #endif | |
202 | ||
203 | } | |
89594431 | 204 | store: /* Reached by every goto above, and by fall-through with the vanilla default still in matmul_fn. */ |
205 | __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED); | |
25df644f | 206 | } |
207 | ||
89594431 | 208 | (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm); /* Forward all arguments unchanged to the chosen implementation. */ |
25df644f | 209 | } |
210 | ||
211 | #else /* Just the vanilla function. */ | |
212 | ||
213 | 'define(`matmul_name',`matmul_'rtype_code)dnl | |
214 | define(`target_attribute',`')dnl | |
215 | include(matmul_internal.m4)dnl | |
216 | `#endif | |
a7c1a652 | 217 | #endif |
25df644f | 218 | ' |