Tpetra parallel linear algebra  Version of the Day
Kokkos_MV_GEMM.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos: Node API and Parallel Node Kernels
6 // Copyright (2008) Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
39 //
40 // ************************************************************************
41 //@HEADER
42 */
43 
44 #ifndef KOKKOS_MV_GEMM_HPP
45 #define KOKKOS_MV_GEMM_HPP
46 
47 // Note this code lives only temporarily in TpetraCore. As soon as
48 // GEMM kernels exist in the TpetraKernels subpackage, and thus a
49 // dependency on Teuchos can be eliminated, the code will move to
50 // TpetraKernels.
51 
52 #include <Teuchos_BLAS.hpp>
53 #include <Kokkos_Blas2_MV.hpp>
54 
55 #ifdef KOKKOS_HAVE_CUDA
56 #include <cublas.h>
57 #endif
58 
59 namespace Teuchos {
60 
61  // mfh 11 Nov 2014: The DeviceGEMM specializations below need to be
62  // able to use Teuchos::BLAS::{GEMM, GEMV}. We provide just enough
63  // of a specialization for Kokkos::complex<{float, double}> to make
64  // DeviceGEMM work. They just defer to BLAS<int,
65  // std::complex<{float, double}> > via reinterpret_cast. Please
66  // feel free to expand these specializations if you need to.
67 
68  template<>
69  class BLAS<int, ::Kokkos::complex<float> > {
70  public:
71  typedef float mag_type;
72  typedef ::Kokkos::complex<float> val_type;
73  typedef std::complex<float> impl_type;
74 
75  BLAS () {}
76  BLAS (const BLAS<int, val_type>&) {}
77  virtual ~BLAS () {}
78 
79  // void ROTG (val_type* da, val_type* db, mag_type* c, val_type* s) const;
80  // void ROT (const int n, val_type* dx, const int incx, val_type* dy, const int incy, RealType* c, val_type* s) const;
81  // RealType ASUM (const int n, const val_type* x, const int incx) const;
82  //void AXPY (const int n, const val_type alpha, const val_type* x, const int incx, val_type* y, const int incy) const;
83  //void COPY (const int n, const val_type* x, const int incx, val_type* y, const int incy) const;
84  //val_type DOT(const int n, const val_type* x, const int incx, const val_type* y, const int incy) const;
85  //RealType NRM2(const int n, const val_type* x, const int incx) const;
86  //void SCAL(const int n, const val_type alpha, val_type* x, const int incx) const;
87  //int IAMAX(const int n, const val_type* x, const int incx) const;
88 
89  void
90  GEMV (ETransp trans, const int m, const int n, const val_type alpha,
91  const val_type* A, const int lda, const val_type* x, const int incx,
92  const val_type beta, val_type* y, const int incy) const
93  {
94  BLAS<int, impl_type> blas;
95  blas.GEMV (trans, m, n, static_cast<impl_type> (alpha),
96  reinterpret_cast<const impl_type*> (A), lda,
97  reinterpret_cast<const impl_type*> (x), incx,
98  static_cast<impl_type> (beta),
99  reinterpret_cast<impl_type*> (y), incy);
100  }
101 
102  //void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const val_type* A, const int lda, val_type* x, const int incx) const;
103  //void GER(const int m, const int n, const val_type alpha, const val_type* x, const int incx, const val_type* y, const int incy, val_type* A, const int lda) const;
104 
105  void
106  GEMM (ETransp transa, ETransp transb, const int m, const int n, const int k,
107  const val_type alpha, const val_type* A, const int lda,
108  const val_type* B, const int ldb, const val_type beta, val_type* C,
109  const int ldc) const
110  {
111  BLAS<int, impl_type> blas;
112  blas.GEMM (transa, transb, m, n, k,
113  static_cast<impl_type> (alpha),
114  reinterpret_cast<const impl_type*> (A), lda,
115  reinterpret_cast<const impl_type*> (B), ldb,
116  static_cast<impl_type> (beta),
117  reinterpret_cast<impl_type*> (C), ldc);
118  }
119 
120  //void SYMM(ESide side, EUplo uplo, const int m, const int n, const val_type alpha, const val_type* A, const int lda, const val_type *B, const int ldb, const val_type beta, val_type *C, const int ldc) const;
121  //void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const val_type alpha, const val_type* A, const int lda, const val_type beta, val_type* C, const int ldc) const;
122  //void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
123  //void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
124  };
125 
126  template<>
127  class BLAS<int, ::Kokkos::complex<double> > {
128  public:
129  typedef double mag_type;
130  typedef ::Kokkos::complex<double> val_type;
131  typedef std::complex<double> impl_type;
132 
133  BLAS () {}
134  BLAS (const BLAS<int, val_type>&) {}
135  virtual ~BLAS () {}
136 
137  // void ROTG (val_type* da, val_type* db, mag_type* c, val_type* s) const;
138  // void ROT (const int n, val_type* dx, const int incx, val_type* dy, const int incy, RealType* c, val_type* s) const;
139  // RealType ASUM (const int n, const val_type* x, const int incx) const;
140  //void AXPY (const int n, const val_type alpha, const val_type* x, const int incx, val_type* y, const int incy) const;
141  //void COPY (const int n, const val_type* x, const int incx, val_type* y, const int incy) const;
142  //val_type DOT(const int n, const val_type* x, const int incx, const val_type* y, const int incy) const;
143  //RealType NRM2(const int n, const val_type* x, const int incx) const;
144  //void SCAL(const int n, const val_type alpha, val_type* x, const int incx) const;
145  //int IAMAX(const int n, const val_type* x, const int incx) const;
146 
147  void
148  GEMV (ETransp trans, const int m, const int n, const val_type alpha,
149  const val_type* A, const int lda, const val_type* x, const int incx,
150  const val_type beta, val_type* y, const int incy) const
151  {
152  BLAS<int, impl_type> blas;
153  blas.GEMV (trans, m, n, static_cast<impl_type> (alpha),
154  reinterpret_cast<const impl_type*> (A), lda,
155  reinterpret_cast<const impl_type*> (x), incx,
156  static_cast<impl_type> (beta),
157  reinterpret_cast<impl_type*> (y), incy);
158  }
159 
160  //void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const val_type* A, const int lda, val_type* x, const int incx) const;
161  //void GER(const int m, const int n, const val_type alpha, const val_type* x, const int incx, const val_type* y, const int incy, val_type* A, const int lda) const;
162 
163  void
164  GEMM (ETransp transa, ETransp transb, const int m, const int n, const int k,
165  const val_type alpha, const val_type* A, const int lda,
166  const val_type* B, const int ldb, const val_type beta, val_type* C,
167  const int ldc) const
168  {
169  BLAS<int, impl_type> blas;
170  blas.GEMM (transa, transb, m, n, k,
171  static_cast<impl_type> (alpha),
172  reinterpret_cast<const impl_type*> (A), lda,
173  reinterpret_cast<const impl_type*> (B), ldb,
174  static_cast<impl_type> (beta),
175  reinterpret_cast<impl_type*> (C), ldc);
176  }
177 
178  //void SYMM(ESide side, EUplo uplo, const int m, const int n, const val_type alpha, const val_type* A, const int lda, const val_type *B, const int ldb, const val_type beta, val_type *C, const int ldc) const;
179  //void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const val_type alpha, const val_type* A, const int lda, const val_type beta, val_type* C, const int ldc) const;
180  //void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
181  //void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
182  };
183 
184 } // namespace Teuchos
185 
186 
187 namespace Kokkos {
188  namespace Impl {
189 
190  template<class ViewType>
191  size_t getStride2DView (ViewType A) {
192  size_t stride[8];
193  A.stride (stride);
194  return A.dimension_1 () > 1 ? stride[1] : A.dimension_0 ();
195  }
196  }
197 
204  template <typename Scalar, typename DeviceType>
205  struct DeviceGEMM {
206  public:
207  static void
208  GEMM (const Teuchos::ETransp transA,
209  const Teuchos::ETransp transB,
210  const Scalar& alpha,
211  const View<const Scalar**, LayoutLeft, DeviceType>& A,
212  const View<const Scalar**, LayoutLeft, DeviceType>& B,
213  const Scalar& beta,
214  const View<Scalar**, LayoutLeft, DeviceType>& C)
215  {
216  const int n = static_cast<int> (C.dimension_1 ());
217  const int lda = static_cast<int> (Impl::getStride2DView (A));
218  Teuchos::BLAS<int,Scalar> blas;
219 
220  // For some BLAS implementations (e.g., MKL), GEMM when B has
221  // one column may be signficantly less efficient than GEMV.
222  if (n == 1 && transB == Teuchos::NO_TRANS) {
223  blas.GEMV (transA, A.dimension_0 (), A.dimension_1 (),
224  alpha, A.ptr_on_device (), lda,
225  B.ptr_on_device (), static_cast<int> (1),
226  beta, C.ptr_on_device (), static_cast<int> (1));
227  }
228  else {
229  const int m = static_cast<int> (C.dimension_0 ());
230  const int k = static_cast<int> (transA == Teuchos::NO_TRANS ?
231  A.dimension_1 () : A.dimension_0 ());
232  const int ldb = static_cast<int> (Impl::getStride2DView (B));
233  const int ldc = static_cast<int> (Impl::getStride2DView (C));
234 
235  blas.GEMM (transA, transB, m, n, k, alpha,
236  A.ptr_on_device(), lda,
237  B.ptr_on_device(), ldb,
238  beta, C.ptr_on_device(), ldc);
239  }
240  }
241  };
242 
243  // FIXME (mfh 10 May 2016) Temporary work-around for #243.
244  // Don't call MKL for this case.
245 #ifdef HAVE_TPETRAKERNELS_MKL
246  template <typename DeviceType>
247  struct DeviceGEMM<double, DeviceType> {
248  public:
249  static void
250  GEMM (const Teuchos::ETransp transA,
251  const Teuchos::ETransp transB,
252  const double& alpha,
253  const View<const double**, LayoutLeft, DeviceType>& A,
254  const View<const double**, LayoutLeft, DeviceType>& B,
255  const double& beta,
256  const View<double**, LayoutLeft, DeviceType>& C)
257  {
258  const int n = static_cast<int> (C.dimension_1 ());
259 
260  // For some BLAS implementations (e.g., MKL), GEMM when B has
261  // one column may be signficantly less efficient than GEMV.
262  if (n == 1 && transB == Teuchos::NO_TRANS) {
263  char trans = 'N';
264  if (transA == Teuchos::TRANS) {
265  trans = 'T';
266  }
267  else if (transA == Teuchos::CONJ_TRANS) {
268  trans = 'C';
269  }
270  auto B_0 = Kokkos::subview (B, Kokkos::ALL (), 0);
271  auto C_0 = Kokkos::subview (C, Kokkos::ALL (), 0);
272  KokkosBlas::gemv (&trans, alpha, A, B_0, beta, C_0);
273  }
274  else {
275  const int m = static_cast<int> (C.dimension_0 ());
276  const int k = static_cast<int> (transA == Teuchos::NO_TRANS ? A.dimension_1 () : A.dimension_0 ());
277  const int lda = static_cast<int> (Impl::getStride2DView (A));
278  const int ldb = static_cast<int> (Impl::getStride2DView (B));
279  const int ldc = static_cast<int> (Impl::getStride2DView (C));
280 
281  Teuchos::BLAS<int,double> blas;
282  blas.GEMM (transA, transB, m, n, k, alpha,
283  A.ptr_on_device(), lda,
284  B.ptr_on_device(), ldb,
285  beta, C.ptr_on_device(), ldc);
286  }
287  }
288  };
289 #endif // HAVE_TPETRAKERNELS_MKL
290 
291 #ifdef KOKKOS_HAVE_CUDA
292  template <typename Scalar>
293  struct DeviceGEMM<Scalar,Cuda> {
294  public:
295  static void
296  GEMM (const Teuchos::ETransp transA,
297  const Teuchos::ETransp transB,
298  const Scalar& alpha,
299  const View<const Scalar**, LayoutLeft, Cuda>& A,
300  const View<const Scalar**,LayoutLeft,Cuda>& B,
301  const Scalar& beta,
302  const View<Scalar**,LayoutLeft,Cuda>& C)
303  {
304  TEUCHOS_TEST_FOR_EXCEPTION
305  (true, std::logic_error, "DeviceGEMM: Kokkos::Cuda has no support "
306  "for GEMM operations over Scalar=" << Teuchos::typeName(alpha) << ".");
307  }
308  };
309 
310  template <>
311  struct DeviceGEMM<float,Cuda> {
312  public:
313  static void
314  GEMM (const Teuchos::ETransp transA,
315  const Teuchos::ETransp transB,
316  const float alpha,
317  const View<const float**,LayoutLeft,Cuda>& A,
318  const View<const float**,LayoutLeft,Cuda>& B,
319  const float beta,
320  const View<float**,LayoutLeft,Cuda>& C)
321  {
322  const int m = static_cast<int>(C.dimension_0()),
323  n = static_cast<int>(C.dimension_1()),
324  k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
325  lda = static_cast<int>(Impl::getStride2DView(A)),
326  ldb = static_cast<int>(Impl::getStride2DView(B)),
327  ldc = static_cast<int>(Impl::getStride2DView(C));
328  const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
329  char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
330  cublasSgemm (char_transA, char_transB, m, n, k, alpha,
331  A.ptr_on_device(), lda, B.ptr_on_device(),
332  ldb, beta, C.ptr_on_device(), ldc);
333 
334 #ifdef HAVE_KOKKOS_DEBUG
335  const cublasStatus info = cublasGetError ();
336  TEUCHOS_TEST_FOR_EXCEPTION
337  (info != CUBLAS_STATUS_SUCCESS, std::runtime_error,
338  "cublasSgemm failed with status " << info << "." );
339 #endif // HAVE_KOKKOS_DEBUG
340  }
341  };
342 
343  template <>
344  struct DeviceGEMM<double,Cuda> {
345  public:
346  static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, double alpha,
347  View<const double**,LayoutLeft,Cuda> A, View<const double**,LayoutLeft,Cuda> B,
348  double beta, View<double**,LayoutLeft,Cuda> C) {
349  const int m = static_cast<int>(C.dimension_0()),
350  n = static_cast<int>(C.dimension_1()),
351  k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
352  lda = static_cast<int>(Impl::getStride2DView(A)),
353  ldb = static_cast<int>(Impl::getStride2DView(B)),
354  ldc = static_cast<int>(Impl::getStride2DView(C));
355  const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
356  char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
357  cublasDgemm(char_transA, char_transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc);
358 #ifdef HAVE_KOKKOS_DEBUG
359  cublasStatus info = cublasGetError();
360  TEUCHOS_TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasDgemm failed with status " << info << "." );
361 #endif
362  }
363  };
364 
365 
366 #endif
367 }
368 #endif // KOKKOS_MV_GEMM_HPP
369 
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
KOKKOS_INLINE_FUNCTION void GEMM(const char transA[], const char transB[], const CoefficientType &alpha, const ViewType1 &A, const ViewType2 &B, const CoefficientType &beta, const ViewType3 &C)
Small dense matrix-matrix multiply: C := alpha*A*B + beta*C
Class that provides GEMM for a particular Kokkos Device.