Tpetra_Experimental_BlockCrsMatrix_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
44 
47 
50 #include "Teuchos_TimeMonitor.hpp"
51 
52 //
53 // mfh 25 May 2016: Temporary fix for #393.
54 //
55 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
56 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
57 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
58 // use them in that case, either.
59 //
60 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
61 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
62 // thing for any GCC version.
63 //
64 #if defined(__CUDACC__)
65  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
66 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
67 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
68 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
69 
70 #elif defined(__GNUC__)
71 
72 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
73 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
74 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
75 
76 #else // some other compiler
77 
78  // Optimistically assume that other compilers aren't broken.
79 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
81 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
82 #endif // defined(__CUDACC__), defined(__GNUC__)
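// Net effect of the logic above: TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA ends
// up undefined for CUDA and for all GCC builds, and defined (to 1) for any
// other compiler unless the user has already defined it themselves.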
83 
84 
85 namespace Tpetra {
86 namespace Experimental {
87 
88 namespace Impl {
89 
90 #if 0
91 template<class AlphaCoeffType,
92  class GraphType,
93  class MatrixValuesType,
94  class InVecType,
95  class BetaCoeffType,
96  class OutVecType>
97 class BcrsApplyNoTrans1VecTeamFunctor {
98 private:
99  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
100  "MatrixValuesType must be a Kokkos::View.");
101  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
102  "OutVecType must be a Kokkos::View.");
103  static_assert (Kokkos::Impl::is_view<InVecType>::value,
104  "InVecType must be a Kokkos::View.");
105  static_assert (std::is_same<MatrixValuesType,
106  typename MatrixValuesType::const_type>::value,
107  "MatrixValuesType must be a const Kokkos::View.");
108  static_assert (std::is_same<OutVecType,
109  typename OutVecType::non_const_type>::value,
110  "OutVecType must be a nonconst Kokkos::View.");
111  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
112  "InVecType must be a const Kokkos::View.");
113  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
114  "MatrixValuesType must be a rank-1 Kokkos::View.");
115  static_assert (static_cast<int> (InVecType::rank) == 1,
116  "InVecType must be a rank-1 Kokkos::View.");
117  static_assert (static_cast<int> (OutVecType::rank) == 1,
118  "OutVecType must be a rank-1 Kokkos::View.");
119  typedef typename MatrixValuesType::non_const_value_type scalar_type;
120  typedef typename GraphType::device_type device_type;
121  typedef typename device_type::execution_space execution_space;
122  typedef typename execution_space::scratch_memory_space shmem_space;
123 
124 public:
126  typedef typename std::remove_const<typename GraphType::data_type>::type
127  local_ordinal_type;
129  typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,
130  execution_space,
131  local_ordinal_type> policy_type;
138  void setX (const InVecType& X) { X_ = X; }
139 
146  void setY (const OutVecType& Y) { Y_ = Y; }
147 
151  KOKKOS_INLINE_FUNCTION local_ordinal_type
152  getScratchSizePerTeam () const
153  {
154  // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
155  // that have run-time sizes, like some of those in Stokhos.
156  typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
157  return blockSize_ * sizeof (y_value_type);
158  }
159 
163  KOKKOS_INLINE_FUNCTION local_ordinal_type
164  getScratchSizePerThread () const
165  {
166  // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
167  // that have run-time sizes, like some of those in Stokhos.
168  typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
169  return blockSize_ * sizeof (y_value_type);
170  }
171 
172 private:
173  KOKKOS_INLINE_FUNCTION local_ordinal_type
174  getNumLclMeshRows () const
175  {
176  return ptr_.dimension_0 () == 0 ?
177  static_cast<local_ordinal_type> (0) :
178  static_cast<local_ordinal_type> (ptr_.dimension_0 () - 1);
179  }
180 
181  static constexpr local_ordinal_type defaultRowsPerTeam = 20;
182 
183 public:
185  local_ordinal_type getNumTeams () const {
186  return 1;
187  // const local_ordinal_type numLclMeshRows = getNumLclMeshRows ();
188  // return (numLclMeshRows + rowsPerTeam_ - 1) / rowsPerTeam_;
189  }
190 
192  BcrsApplyNoTrans1VecTeamFunctor (const typename std::decay<AlphaCoeffType>::type& alpha,
193  const GraphType& graph,
194  const MatrixValuesType& val,
195  const local_ordinal_type blockSize,
196  const InVecType& X,
197  const typename std::decay<BetaCoeffType>::type& beta,
198  const OutVecType& Y,
199  const local_ordinal_type rowsPerTeam = defaultRowsPerTeam) :
200  alpha_ (alpha),
201  ptr_ (graph.row_map),
202  ind_ (graph.entries),
203  val_ (val),
204  blockSize_ (blockSize),
205  X_ (X),
206  beta_ (beta),
207  Y_ (Y),
208  rowsPerTeam_ (rowsPerTeam)
209  {
210  // std::ostringstream os;
211  // os << "Functor ctor: numLclMeshRows = " << getNumLclMeshRows () << std::endl;
212  // std::cerr << os.str ();
213  }
214 
215  KOKKOS_INLINE_FUNCTION void
216  operator () (const typename policy_type::member_type& member) const
217  {
222  using Kokkos::Details::ArithTraits;
223  // I'm not writing 'using Kokkos::make_pair;' here, because that
224  // may break builds for users who make the mistake of putting
225  // 'using namespace std;' in the global namespace. Please don't
226  // ever do that! But just in case you do, I'll take this
227  // precaution.
228  using Kokkos::parallel_for;
229  using Kokkos::subview;
230  typedef local_ordinal_type LO;
231  typedef typename decltype (ptr_)::non_const_value_type offset_type;
232  typedef Kokkos::View<typename OutVecType::non_const_value_type*,
233  shmem_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
234  shared_array_type;
235  typedef Kokkos::View<typename OutVecType::non_const_value_type*,
236  Kokkos::LayoutRight,
237  device_type,
238  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
239  out_little_vec_type;
240  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
241  Kokkos::LayoutRight,
242  device_type,
243  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
244  little_block_type;
245 
246  const LO leagueRank = member.league_rank();
247 
248  // This looks wrong! All the threads are writing into the same local storage.
249  // shared_array_type threadLocalMem =
250  // shared_array_type (member.thread_scratch (1), blockSize_);
251  shared_array_type threadLocalMem =
252  shared_array_type (member.thread_scratch (1), blockSize_ * rowsPerTeam_);
253 
254  // This looks wrong! All the threads are writing into the same local storage.
255  //out_little_vec_type Y_tlm (threadLocalMem.ptr_on_device (), blockSize_, 1);
256 
257  const LO numLclMeshRows = getNumLclMeshRows ();
258  const LO rowBeg = leagueRank * rowsPerTeam_;
259  const LO rowTmp = rowBeg + rowsPerTeam_;
260  const LO rowEnd = rowTmp < numLclMeshRows ? rowTmp : numLclMeshRows;
261 
262  // {
263  // std::ostringstream os;
264  // os << leagueRank << "," << member.team_rank () << ": "
265  // << rowBeg << "," << rowEnd << std::endl;
266  // std::cerr << os.str ();
267  // }
268 
269  // Each team takes rowsPerTeam_ (block) rows.
270  // Each thread in the team takes a single (block) row.
271  parallel_for (Kokkos::TeamThreadRange (member, rowBeg, rowEnd),
272  [&] (const LO& lclRow) {
273  // Each thread in the team gets its own temp storage.
274  out_little_vec_type Y_tlm (threadLocalMem.ptr_on_device () + blockSize_ * member.team_rank (), blockSize_);
275 
276  const offset_type Y_ptBeg = lclRow * blockSize_;
277  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
278  auto Y_cur =
279  subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
280  if (beta_ == ArithTraits<BetaCoeffType>::zero ()) {
281  FILL (Y_tlm, ArithTraits<BetaCoeffType>::zero ());
282  }
283  else if (beta_ == ArithTraits<BetaCoeffType>::one ()) {
284  COPY (Y_cur, Y_tlm);
285  }
286  else { // beta != 0 && beta != 1
287  COPY (Y_cur, Y_tlm);
288  SCAL (beta_, Y_tlm);
289  }
290 
291  if (alpha_ != ArithTraits<AlphaCoeffType>::zero ()) {
292  const offset_type blkBeg = ptr_[lclRow];
293  const offset_type blkEnd = ptr_[lclRow+1];
294  // Precompute to save integer math in the inner loop.
295  const offset_type bs2 = blockSize_ * blockSize_;
296  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
297  ++absBlkOff) {
298  little_block_type A_cur (val_.ptr_on_device () + absBlkOff * bs2,
299  blockSize_, blockSize_);
300  const offset_type X_blkCol = ind_[absBlkOff];
301  const offset_type X_ptBeg = X_blkCol * blockSize_;
302  const offset_type X_ptEnd = X_ptBeg + blockSize_;
303  auto X_cur =
304  subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
305  // Y_tlm += alpha*A_cur*X_cur
306  GEMV (alpha_, A_cur, X_cur, Y_tlm);
307  } // for each entry in current local block row of matrix
308  COPY (Y_tlm, Y_cur);
309  }
310  });
311  }
312 
313 private:
314  typename std::decay<AlphaCoeffType>::type alpha_;
315  typename GraphType::row_map_type::const_type ptr_;
316  typename GraphType::entries_type::const_type ind_;
317  MatrixValuesType val_;
318  local_ordinal_type blockSize_;
319  InVecType X_;
320  typename std::decay<BetaCoeffType>::type beta_;
321  OutVecType Y_;
322  local_ordinal_type rowsPerTeam_;
323 };
324 #endif // 0
325 
326 template<class AlphaCoeffType,
327  class GraphType,
328  class MatrixValuesType,
329  class InVecType,
330  class BetaCoeffType,
331  class OutVecType>
332 class BcrsApplyNoTrans1VecFunctor {
333 private:
334  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
335  "MatrixValuesType must be a Kokkos::View.");
336  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
337  "OutVecType must be a Kokkos::View.");
338  static_assert (Kokkos::Impl::is_view<InVecType>::value,
339  "InVecType must be a Kokkos::View.");
340  static_assert (std::is_same<MatrixValuesType,
341  typename MatrixValuesType::const_type>::value,
342  "MatrixValuesType must be a const Kokkos::View.");
343  static_assert (std::is_same<OutVecType,
344  typename OutVecType::non_const_type>::value,
345  "OutVecType must be a nonconst Kokkos::View.");
346  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
347  "InVecType must be a const Kokkos::View.");
348  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
349  "MatrixValuesType must be a rank-1 Kokkos::View.");
350  static_assert (static_cast<int> (InVecType::rank) == 1,
351  "InVecType must be a rank-1 Kokkos::View.");
352  static_assert (static_cast<int> (OutVecType::rank) == 1,
353  "OutVecType must be a rank-1 Kokkos::View.");
354  typedef typename MatrixValuesType::non_const_value_type scalar_type;
355 
356 public:
357  typedef typename GraphType::device_type device_type;
358 
360  typedef typename std::remove_const<typename GraphType::data_type>::type
361  local_ordinal_type;
363  typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,
364  typename device_type::execution_space,
365  local_ordinal_type> policy_type;
372  void setX (const InVecType& X) { X_ = X; }
373 
380  void setY (const OutVecType& Y) { Y_ = Y; }
381 
383  BcrsApplyNoTrans1VecFunctor (const typename std::decay<AlphaCoeffType>::type& alpha,
384  const GraphType& graph,
385  const MatrixValuesType& val,
386  const local_ordinal_type blockSize,
387  const InVecType& X,
388  const typename std::decay<BetaCoeffType>::type& beta,
389  const OutVecType& Y) :
390  alpha_ (alpha),
391  ptr_ (graph.row_map),
392  ind_ (graph.entries),
393  val_ (val),
394  blockSize_ (blockSize),
395  X_ (X),
396  beta_ (beta),
397  Y_ (Y)
398  {}
399 
400  KOKKOS_INLINE_FUNCTION void
401  operator () (const local_ordinal_type& lclRow) const
402  {
407  using Kokkos::Details::ArithTraits;
408  // I'm not writing 'using Kokkos::make_pair;' here, because that
409  // may break builds for users who make the mistake of putting
410  // 'using namespace std;' in the global namespace. Please don't
411  // ever do that! But just in case you do, I'll take this
412  // precaution.
413  using Kokkos::parallel_for;
414  using Kokkos::subview;
415  typedef typename decltype (ptr_)::non_const_value_type offset_type;
416  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
417  Kokkos::LayoutRight,
418  device_type,
419  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
420  little_block_type;
421 
422  const offset_type Y_ptBeg = lclRow * blockSize_;
423  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
424  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
425 
426  // This version of the code does not use temporary storage.
427  // Each thread writes to its own block of the target vector.
428  if (beta_ == ArithTraits<BetaCoeffType>::zero ()) {
429  FILL (Y_cur, ArithTraits<BetaCoeffType>::zero ());
430  }
431  else if (beta_ != ArithTraits<BetaCoeffType>::one ()) { // beta != 0 && beta != 1
432  SCAL (beta_, Y_cur);
433  }
434 
435  if (alpha_ != ArithTraits<AlphaCoeffType>::zero ()) {
436  const offset_type blkBeg = ptr_[lclRow];
437  const offset_type blkEnd = ptr_[lclRow+1];
438  // Precompute to save integer math in the inner loop.
439  const offset_type bs2 = blockSize_ * blockSize_;
440  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
441  ++absBlkOff) {
442  little_block_type A_cur (val_.ptr_on_device () + absBlkOff * bs2,
443  blockSize_, blockSize_);
444  const offset_type X_blkCol = ind_[absBlkOff];
445  const offset_type X_ptBeg = X_blkCol * blockSize_;
446  const offset_type X_ptEnd = X_ptBeg + blockSize_;
447  auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
448 
449  GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
450  } // for each entry in current local block row of matrix
451  }
452  }
453 
454 private:
455  typename std::decay<AlphaCoeffType>::type alpha_;
456  typename GraphType::row_map_type::const_type ptr_;
457  typename GraphType::entries_type::const_type ind_;
458  MatrixValuesType val_;
459  local_ordinal_type blockSize_;
460  InVecType X_;
461  typename std::decay<BetaCoeffType>::type beta_;
462  OutVecType Y_;
463 };
464 
465 template<class AlphaCoeffType,
466  class GraphType,
467  class MatrixValuesType,
468  class InMultiVecType,
469  class BetaCoeffType,
470  class OutMultiVecType>
471 void
472 bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
473  const GraphType& graph,
474  const MatrixValuesType& val,
475  const typename std::remove_const<typename GraphType::data_type>::type blockSize,
476  const InMultiVecType& X,
477  const BetaCoeffType& beta,
478  const OutMultiVecType& Y
479 #if 0
480  , const typename std::remove_const<typename GraphType::data_type>::type rowsPerTeam = 20
481 #endif // 0
482  )
483 {
484  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
485  "MatrixValuesType must be a Kokkos::View.");
486  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
487  "OutMultiVecType must be a Kokkos::View.");
488  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
489  "InMultiVecType must be a Kokkos::View.");
490  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
491  "MatrixValuesType must be a rank-1 Kokkos::View.");
492  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
493  "OutMultiVecType must be a rank-2 Kokkos::View.");
494  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
495  "InMultiVecType must be a rank-2 Kokkos::View.");
496 
497  typedef typename MatrixValuesType::const_type matrix_values_type;
498  typedef typename OutMultiVecType::non_const_type out_multivec_type;
499  typedef typename InMultiVecType::const_type in_multivec_type;
500  typedef typename std::decay<AlphaCoeffType>::type alpha_type;
501  typedef typename std::decay<BetaCoeffType>::type beta_type;
502  typedef typename std::remove_const<typename GraphType::data_type>::type LO;
503 
504  const LO numLocalMeshRows = graph.row_map.dimension_0 () == 0 ?
505  static_cast<LO> (0) :
506  static_cast<LO> (graph.row_map.dimension_0 () - 1);
507  const LO numVecs = Y.dimension_1 ();
508  if (numLocalMeshRows == 0 || numVecs == 0) {
509  return; // code below doesn't handle numVecs==0 correctly
510  }
511 
512  // These assignments avoid instantiating the functor extra times
513  // unnecessarily, e.g., for X const vs. nonconst. We only need the
514  // X const case, so only instantiate for that case.
515  in_multivec_type X_in = X;
516  out_multivec_type Y_out = Y;
517 
518  // The functor only knows how to handle one vector at a time, and it
519  // expects 1-D Views. Thus, we need to know the type of each column
520  // of X and Y.
521  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
522  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
523 #if 0
524  typedef BcrsApplyNoTrans1VecTeamFunctor<alpha_type, GraphType,
525  matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
526 #else
527  typedef BcrsApplyNoTrans1VecFunctor<alpha_type, GraphType,
528  matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
529 #endif // 0
530  typedef typename functor_type::policy_type policy_type;
531 
532  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
533  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
534 #if 0
535  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0, rowsPerTeam);
536  const LO numTeams = functor.getNumTeams ();
537  policy_type policy (numTeams, Kokkos::AUTO ());
538  {
539  // KJ : hierarchy level of memory allocated e.g., cache (1),
540  // HBM (2), DDR (3), not used for now
541  const LO level = 1;
542  // KJ : for now provide two options for parallelizing (for vs. reduce)
543  const LO scratchSizePerTeam = functor.getScratchSizePerTeam (); // used for team parallel_red
544  const LO scratchSizePerThread = functor.getScratchSizePerThread (); // used for team parallel_for
545  policy =
546  policy.set_scratch_size (level,
547  Kokkos::PerTeam (scratchSizePerTeam),
548  Kokkos::PerThread (scratchSizePerThread));
549  }
550 #else
551  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
552  policy_type policy (0, numLocalMeshRows);
553 #endif // 0
554 
555  // Compute the first column of Y.
556  Kokkos::parallel_for (policy, functor);
557 
558  // Compute the remaining columns of Y.
559  for (LO j = 1; j < numVecs; ++j) {
560  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
561  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
562  functor.setX (X_j);
563  functor.setY (Y_j);
564  Kokkos::parallel_for (policy, functor);
565  }
566 }
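// Illustrative sketch (hypothetical names G, val, X, Y; not from the original
// source): given a local graph G with device row_map/entries, a rank-1 device
// View 'val' holding each block's blockSize*blockSize entries contiguously in
// row-major order, a rank-2 const input multivector X, and a rank-2 nonconst
// output multivector Y (both point-indexed), a call such as
//
//   Impl::bcrsLocalApplyNoTrans (alpha, G, val, blockSize, X, beta, Y);
//
// overwrites Y with beta*Y + alpha*A*X, processing one column of X and Y at a
// time with the range-policy functor defined above.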
567 
568 } // namespace Impl
569 
570 namespace { // (anonymous)
571 
572 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
573 // version that takes two Kokkos::View arguments).
574 template<class Scalar, class LO, class GO, class Node>
575 class GetLocalDiagCopy {
576 public:
577  typedef typename Node::device_type device_type;
578  typedef size_t diag_offset_type;
579  typedef Kokkos::View<const size_t*, device_type,
580  Kokkos::MemoryUnmanaged> diag_offsets_type;
581  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
582  typedef typename global_graph_type::local_graph_type local_graph_type;
583  typedef typename local_graph_type::row_map_type row_offsets_type;
584  typedef typename ::Tpetra::Experimental::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
585  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
586  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
587 
588  // Constructor
589  GetLocalDiagCopy (const diag_type& diag,
590  const values_type& val,
591  const diag_offsets_type& diagOffsets,
592  const row_offsets_type& ptr,
593  const LO blockSize) :
594  diag_ (diag),
595  diagOffsets_ (diagOffsets),
596  ptr_ (ptr),
597  blockSize_ (blockSize),
598  offsetPerBlock_ (blockSize_*blockSize_),
599  val_(val)
600  {}
601 
602  KOKKOS_INLINE_FUNCTION void
603  operator() (const LO& lclRowInd) const
604  {
605  using Kokkos::ALL;
606 
607  // Get row offset
608  const size_t absOffset = ptr_[lclRowInd];
609 
610  // Get offset relative to start of row
611  const size_t relOffset = diagOffsets_[lclRowInd];
612 
613  // Get the total offset
614  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
615 
616  // Get a view of the block. BCRS currently uses LayoutRight
617  // regardless of the device.
618  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
619  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
620  const_little_block_type;
621  const_little_block_type D_in (val_.ptr_on_device () + pointOffset,
622  blockSize_, blockSize_);
623  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
624  COPY (D_in, D_out);
625  }
626 
627  private:
628  diag_type diag_;
629  diag_offsets_type diagOffsets_;
630  row_offsets_type ptr_;
631  LO blockSize_;
632  LO offsetPerBlock_;
633  values_type val_;
634  };
635 } // namespace (anonymous)
636 
637  template<class Scalar, class LO, class GO, class Node>
638  std::ostream&
639  BlockCrsMatrix<Scalar, LO, GO, Node>::
640  markLocalErrorAndGetStream ()
641  {
642  * (this->localError_) = true;
643  if ((*errs_).is_null ()) {
644  *errs_ = Teuchos::rcp (new std::ostringstream ());
645  }
646  return **errs_;
647  }
648 
649  template<class Scalar, class LO, class GO, class Node>
650  BlockCrsMatrix<Scalar, LO, GO, Node>::
651  BlockCrsMatrix () :
652  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
653  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
654  blockSize_ (static_cast<LO> (0)),
655  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
656  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
657  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
658  offsetPerBlock_ (0),
659  localError_ (new bool (false)),
660  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
661  {
662  }
663 
664  template<class Scalar, class LO, class GO, class Node>
665  BlockCrsMatrix<Scalar, LO, GO, Node>::
666  BlockCrsMatrix (const crs_graph_type& graph,
667  const LO blockSize) :
668  dist_object_type (graph.getMap ()),
669  graph_ (graph),
670  rowMeshMap_ (* (graph.getRowMap ())),
671  blockSize_ (blockSize),
672  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
673  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
674  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
675  offsetPerBlock_ (blockSize * blockSize),
676  localError_ (new bool (false)),
677  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
678  {
679  TEUCHOS_TEST_FOR_EXCEPTION(
680  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
681  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
682  "rows (isSorted() is false). This class assumes sorted rows.");
683 
684  graphRCP_ = Teuchos::rcpFromRef(graph_);
685 
686  // Trick to test whether LO is nonpositive, without a compiler
687  // warning in case LO is unsigned (which is generally a bad idea
688  // anyway). I don't promise that the trick works, but it
689  // generally does with gcc at least, in my experience.
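 690  // (Why the trick works: for signed LO, blockSize + 1 <= 1 exactly when
 690  // blockSize <= 0; for unsigned LO, the comparison is true only for
 690  // blockSize == 0, the lone "nonpositive" unsigned value, apart from the
 690  // wrap-around case where blockSize is the maximum representable value.)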
690  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
691  TEUCHOS_TEST_FOR_EXCEPTION(
692  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
693  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
694  " <= 0. The block size must be positive.");
695 
696  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
697  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
698 
699  {
700  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
701  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
702 
703  row_map_type ptr_d = graph.getLocalGraph ().row_map;
704  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
705  Kokkos::deep_copy (ptr_h_nc, ptr_d);
706  ptrHost_ = ptr_h_nc;
707  }
708  {
709  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
710  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
711 
712  entries_type ind_d = graph.getLocalGraph ().entries;
713  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
714  Kokkos::deep_copy (ind_h_nc, ind_d);
715  indHost_ = ind_h_nc;
716  }
717 
718  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
719  val_ = decltype (val_) ("val", numValEnt);
720  }
721 
722  template<class Scalar, class LO, class GO, class Node>
723  BlockCrsMatrix<Scalar, LO, GO, Node>::
724  BlockCrsMatrix (const crs_graph_type& graph,
725  const map_type& domainPointMap,
726  const map_type& rangePointMap,
727  const LO blockSize) :
728  dist_object_type (graph.getMap ()),
729  graph_ (graph),
730  rowMeshMap_ (* (graph.getRowMap ())),
731  domainPointMap_ (domainPointMap),
732  rangePointMap_ (rangePointMap),
733  blockSize_ (blockSize),
734  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
735  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
736  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
737  offsetPerBlock_ (blockSize * blockSize),
738  localError_ (new bool (false)),
739  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
740  {
741  TEUCHOS_TEST_FOR_EXCEPTION(
742  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
743  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
744  "rows (isSorted() is false). This class assumes sorted rows.");
745 
746  graphRCP_ = Teuchos::rcpFromRef(graph_);
747 
748  // Trick to test whether LO is nonpositive, without a compiler
749  // warning in case LO is unsigned (which is generally a bad idea
750  // anyway). I don't promise that the trick works, but it
751  // generally does with gcc at least, in my experience.
752  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
753  TEUCHOS_TEST_FOR_EXCEPTION(
754  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
755  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
756  " <= 0. The block size must be positive.");
757 
758  {
759  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
760  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
761 
762  row_map_type ptr_d = graph.getLocalGraph ().row_map;
763  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
764  Kokkos::deep_copy (ptr_h_nc, ptr_d);
765  ptrHost_ = ptr_h_nc;
766  }
767  {
768  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
769  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
770 
771  entries_type ind_d = graph.getLocalGraph ().entries;
772  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
773  Kokkos::deep_copy (ind_h_nc, ind_d);
774  indHost_ = ind_h_nc;
775  }
776 
777  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
778  val_ = decltype (val_) ("val", numValEnt);
779  }
780 
781  template<class Scalar, class LO, class GO, class Node>
782  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
783  BlockCrsMatrix<Scalar, LO, GO, Node>::
784  getDomainMap () const
785  { // Copy constructor of map_type does a shallow copy.
786  // We're only returning by RCP for backwards compatibility.
787  return Teuchos::rcp (new map_type (domainPointMap_));
788  }
789 
790  template<class Scalar, class LO, class GO, class Node>
791  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
792  BlockCrsMatrix<Scalar, LO, GO, Node>::
793  getRangeMap () const
794  { // Copy constructor of map_type does a shallow copy.
795  // We're only returning by RCP for backwards compatibility.
796  return Teuchos::rcp (new map_type (rangePointMap_));
797  }
798 
799  template<class Scalar, class LO, class GO, class Node>
800  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
801  BlockCrsMatrix<Scalar, LO, GO, Node>::
802  getRowMap () const
803  {
804  return graph_.getRowMap();
805  }
806 
807  template<class Scalar, class LO, class GO, class Node>
808  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
809  BlockCrsMatrix<Scalar, LO, GO, Node>::
810  getColMap () const
811  {
812  return graph_.getColMap();
813  }
814 
815  template<class Scalar, class LO, class GO, class Node>
816  global_size_t
817  BlockCrsMatrix<Scalar, LO, GO, Node>::
818  getGlobalNumRows () const
819  {
820  return graph_.getGlobalNumRows();
821  }
822 
823  template<class Scalar, class LO, class GO, class Node>
824  size_t
825  BlockCrsMatrix<Scalar, LO, GO, Node>::
826  getNodeNumRows () const
827  {
828  return graph_.getNodeNumRows();
829  }
830 
831  template<class Scalar, class LO, class GO, class Node>
832  size_t
833  BlockCrsMatrix<Scalar, LO, GO, Node>::
834  getNodeMaxNumRowEntries () const
835  {
836  return graph_.getNodeMaxNumRowEntries();
837  }
838 
839  template<class Scalar, class LO, class GO, class Node>
840  void
841  BlockCrsMatrix<Scalar, LO, GO, Node>::
842  apply (const mv_type& X,
843  mv_type& Y,
844  Teuchos::ETransp mode,
845  Scalar alpha,
846  Scalar beta) const
847  {
849  TEUCHOS_TEST_FOR_EXCEPTION(
850  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
851  std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::apply: "
852  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
853  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
854 
855  BMV X_view;
856  BMV Y_view;
857  const LO blockSize = getBlockSize ();
858  try {
859  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
860  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
861  }
862  catch (std::invalid_argument& e) {
863  TEUCHOS_TEST_FOR_EXCEPTION(
864  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
865  "apply: Either the input MultiVector X or the output MultiVector Y "
866  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
867  "graph. BlockMultiVector's constructor threw the following exception: "
868  << e.what ());
869  }
870 
871  try {
872  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
873  // either that or mark fields of this class as 'mutable'. The
874  // problem is that applyBlock wants to do lazy initialization of
875  // temporary block multivectors.
876  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
877  } catch (std::invalid_argument& e) {
878  TEUCHOS_TEST_FOR_EXCEPTION(
879  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
880  "apply: The implementation method applyBlock complained about having "
881  "an invalid argument. It reported the following: " << e.what ());
882  } catch (std::logic_error& e) {
883  TEUCHOS_TEST_FOR_EXCEPTION(
884  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
885  "apply: The implementation method applyBlock complained about a "
886  "possible bug in its implementation. It reported the following: "
887  << e.what ());
888  } catch (std::exception& e) {
889  TEUCHOS_TEST_FOR_EXCEPTION(
890  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
891  "apply: The implementation method applyBlock threw an exception which "
892  "is neither std::invalid_argument nor std::logic_error, but is a "
893  "subclass of std::exception. It reported the following: "
894  << e.what ());
895  } catch (...) {
896  TEUCHOS_TEST_FOR_EXCEPTION(
897  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
898  "apply: The implementation method applyBlock threw an exception which "
899  "is not an instance of a subclass of std::exception. This probably "
900  "indicates a bug. Please report this to the Tpetra developers.");
901  }
902  }
903 
904  template<class Scalar, class LO, class GO, class Node>
905  void
906  BlockCrsMatrix<Scalar, LO, GO, Node>::
907  applyBlock (const BlockMultiVector<Scalar, LO, GO, Node>& X,
908  BlockMultiVector<Scalar, LO, GO, Node>& Y,
909  Teuchos::ETransp mode,
910  const Scalar alpha,
911  const Scalar beta)
912  {
913  TEUCHOS_TEST_FOR_EXCEPTION(
914  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
915  "Tpetra::Experimental::BlockCrsMatrix::applyBlock: "
916  "X and Y have different block sizes. X.getBlockSize() = "
917  << X.getBlockSize () << " != Y.getBlockSize() = "
918  << Y.getBlockSize () << ".");
919 
920  if (mode == Teuchos::NO_TRANS) {
921  applyBlockNoTrans (X, Y, alpha, beta);
922  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
923  applyBlockTrans (X, Y, mode, alpha, beta);
924  } else {
925  TEUCHOS_TEST_FOR_EXCEPTION(
926  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
927  "applyBlock: Invalid 'mode' argument. Valid values are "
928  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
929  }
930  }
931 
932  template<class Scalar, class LO, class GO, class Node>
933  void
934  BlockCrsMatrix<Scalar, LO, GO, Node>::
935  setAllToScalar (const Scalar& alpha)
936  {
937 #ifdef HAVE_TPETRA_DEBUG
938  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::setAllToScalar: ";
939 #endif // HAVE_TPETRA_DEBUG
940 
941  if (this->template need_sync<device_type> ()) {
942  // If we need to sync to device, then the data were last
943  // modified on host. In that case, we should again modify them
944  // on host.
945 #ifdef HAVE_TPETRA_DEBUG
946  TEUCHOS_TEST_FOR_EXCEPTION
947  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
948  prefix << "The matrix's values need sync on both device and host.");
949 #endif // HAVE_TPETRA_DEBUG
950  this->template modify<Kokkos::HostSpace> ();
951  Kokkos::deep_copy (this->template getValues<Kokkos::HostSpace> (), alpha);
952  }
953  else if (this->template need_sync<Kokkos::HostSpace> ()) {
954  // If we need to sync to host, then the data were last modified
955  // on device. In that case, we should again modify them on
956  // device.
957 #ifdef HAVE_TPETRA_DEBUG
958  TEUCHOS_TEST_FOR_EXCEPTION
959  (this->template need_sync<device_type> (), std::runtime_error,
960  prefix << "The matrix's values need sync on both host and device.");
961 #endif // HAVE_TPETRA_DEBUG
962  this->template modify<device_type> ();
963  Kokkos::deep_copy (this->template getValues<device_type> (), alpha);
964  }
965  else { // neither host nor device marked as modified, so modify on device
966  this->template modify<device_type> ();
967  Kokkos::deep_copy (this->template getValues<device_type> (), alpha);
968  }
969  }
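// For example (hypothetical matrix A), calling
// A.setAllToScalar (Teuchos::ScalarTraits<Scalar>::zero ()) overwrites every
// stored block entry with zero, in whichever memory space holds the most
// recently modified copy of the values, without changing the graph.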
970 
971  template<class Scalar, class LO, class GO, class Node>
972  LO
973  BlockCrsMatrix<Scalar, LO, GO, Node>::
974  replaceLocalValues (const LO localRowInd,
975  const LO colInds[],
976  const Scalar vals[],
977  const LO numColInds) const
978  {
979 #ifdef HAVE_TPETRA_DEBUG
980  const char prefix[] =
981  "Tpetra::Experimental::BlockCrsMatrix::replaceLocalValues: ";
982 #endif // HAVE_TPETRA_DEBUG
983 
984  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
985  // We modified no values, because the input local row index is
986  // invalid on the calling process. That may not be an error, if
987  // numColInds is zero anyway; it doesn't matter. This is the
988  // advantage of returning the number of valid indices.
989  return static_cast<LO> (0);
990  }
991  const impl_scalar_type* const vIn =
992  reinterpret_cast<const impl_scalar_type*> (vals);
993  const size_t absRowBlockOffset = ptrHost_[localRowInd];
994  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
995  const LO perBlockSize = this->offsetPerBlock ();
996  LO hint = 0; // Guess for the relative offset into the current row
997  LO pointOffset = 0; // Current offset into input values
998  LO validCount = 0; // number of valid column indices in colInds
999 
1000 #ifdef HAVE_TPETRA_DEBUG
1001  TEUCHOS_TEST_FOR_EXCEPTION
1002  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1003  prefix << "The matrix's data were last modified on device, but have "
1004  "not been sync'd to host. Please sync to host (by calling "
1005  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1006  "method.");
1007 #endif // HAVE_TPETRA_DEBUG
1008 
1009  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1010  // version of the data always exists (no lazy allocation for host
1011  // data).
1013  auto vals_host_out =
1014  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1015  impl_scalar_type* vals_host_out_raw = vals_host_out.ptr_on_device ();
1016 
1017  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1018  const LO relBlockOffset =
1019  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1020  if (relBlockOffset != LINV) {
1021  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1022  // are stored contiguously, with no padding. "Contiguously"
1023  // means that all memory between the first and last entries
1024  // belongs to the block (no striding). "No padding" means
1025  // that getBlockSize() * getBlockSize() is exactly the number
1026  // of entries that the block uses. For another place where
1027  // this assumption is encoded, see sumIntoLocalValues.
1028 
1029  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1030  // little_block_type A_old =
1031  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1032  impl_scalar_type* const A_old =
1033  vals_host_out_raw + absBlockOffset * perBlockSize;
1034  // const_little_block_type A_new =
1035  // getConstLocalBlockFromInput (vIn, pointOffset);
1036  const impl_scalar_type* const A_new = vIn + pointOffset;
1037  // COPY (A_new, A_old);
1038  for (LO i = 0; i < perBlockSize; ++i) {
1039  A_old[i] = A_new[i];
1040  }
1041  hint = relBlockOffset + 1;
1042  ++validCount;
1043  }
1044  }
1045  return validCount;
1046  }
1047 
1048  template <class Scalar, class LO, class GO, class Node>
1049  void
1050  BlockCrsMatrix<Scalar, LO, GO, Node>::
1051  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1052  Kokkos::MemoryUnmanaged>& offsets) const
1053  {
1054  graph_.getLocalDiagOffsets (offsets);
1055  }
1056 
1057  template <class Scalar, class LO, class GO, class Node>
1058  void TPETRA_DEPRECATED
1059  BlockCrsMatrix<Scalar, LO, GO, Node>::
1060  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
1061  {
1062  // mfh 19 Mar 2016: We plan to deprecate the ArrayRCP version of
1063  // this method in CrsGraph too, so don't call it (otherwise build
1064  // warnings will show up and annoy users). Instead, copy results
1065  // in and out, if the memory space requires it.
1066 
1067  const size_t lclNumRows = graph_.getNodeNumRows ();
1068  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
1069  offsets.resize (lclNumRows);
1070  }
1071 
1072  // The input ArrayRCP must always be a host pointer. Thus, if
1073  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
1074  // to write to that allocation directly as a Kokkos::View.
1075  typedef typename device_type::memory_space memory_space;
1076  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
1077  // It is always syntactically correct to assign a raw host
1078  // pointer to a device View, so this code will compile correctly
1079  // even if this branch never runs.
1080  typedef Kokkos::View<size_t*, device_type,
1081  Kokkos::MemoryUnmanaged> output_type;
1082  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1083  graph_.getLocalDiagOffsets (offsetsOut);
1084  }
1085  else {
1086  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
1087  graph_.getLocalDiagOffsets (offsetsTmp);
1088  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
1089  Kokkos::MemoryUnmanaged> output_type;
1090  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1091  Kokkos::deep_copy (offsetsOut, offsetsTmp);
1092  }
1093  }
1094 
1095  template <class Scalar, class LO, class GO, class Node>
1096  void
1097  BlockCrsMatrix<Scalar, LO, GO, Node>::
1098  localGaussSeidel (const BlockMultiVector<Scalar, LO, GO, Node>& B,
1099  BlockMultiVector<Scalar, LO, GO, Node>& X,
1100  const Kokkos::View<impl_scalar_type***, device_type,
1101  Kokkos::MemoryUnmanaged>& D_inv,
1102  const Scalar& omega,
1103  const ESweepDirection direction) const
1104  {
1105  using Kokkos::ALL;
1106  const impl_scalar_type zero =
1107  Kokkos::Details::ArithTraits<impl_scalar_type>::zero ();
1108  const impl_scalar_type one =
1109  Kokkos::Details::ArithTraits<impl_scalar_type>::one ();
1110  const LO numLocalMeshRows =
1111  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1112  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1113 
1114  // If using (new) Kokkos, replace localMem with thread-local
1115  // memory. Note that for larger block sizes, this will affect the
1116  // two-level parallelization. Look to Stokhos for best practice
1117  // on making this fast for GPUs.
1118  const LO blockSize = getBlockSize ();
1119  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1120  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
1121  little_vec_type X_lcl (localMem.getRawPtr (), blockSize);
1122 
1123  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
1124  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
1125  if (direction == Forward) {
1126  rowBegin = 1;
1127  rowEnd = numLocalMeshRows+1;
1128  rowStride = 1;
1129  }
1130  else if (direction == Backward) {
1131  rowBegin = numLocalMeshRows;
1132  rowEnd = 0;
1133  rowStride = -1;
1134  }
1135  else if (direction == Symmetric) {
1136  this->localGaussSeidel (B, X, D_inv, omega, Forward);
1137  this->localGaussSeidel (B, X, D_inv, omega, Backward);
1138  return;
1139  }
1140 
1141  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
1142  const Scalar minus_omega = -omega;
1143 
1144  if (numVecs == 1) {
1145  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1146  const LO actlRow = lclRow - 1;
1147 
1148  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
1149  COPY (B_cur, X_lcl);
1150  SCAL (omega, X_lcl);
1151 
1152  const size_t meshBeg = ptrHost_[actlRow];
1153  const size_t meshEnd = ptrHost_[actlRow+1];
1154  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1155  const LO meshCol = indHost_[absBlkOff];
1156  const_little_block_type A_cur =
1157  getConstLocalBlockFromAbsOffset (absBlkOff);
1158  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1159 
1160  // X_lcl += alpha*A_cur*X_cur
1161  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1162  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
1163  GEMV (alpha, A_cur, X_cur, X_lcl);
1164  } // for each entry in the current local row of the matrix
1165 
1166  // NOTE (mfh 20 Jan 2016) The two input Views here are
1167  // unmanaged already, so we don't have to take unmanaged
1168  // subviews first.
1169  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1170  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
1171  FILL (X_update, zero);
1172  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1173  } // for each local row of the matrix
1174  }
1175  else {
1176  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1177  for (LO j = 0; j < numVecs; ++j) {
1178  LO actlRow = lclRow-1;
1179 
1180  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
1181  COPY (B_cur, X_lcl);
1182  SCAL (omega, X_lcl);
1183 
1184  const size_t meshBeg = ptrHost_[actlRow];
1185  const size_t meshEnd = ptrHost_[actlRow+1];
1186  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1187  const LO meshCol = indHost_[absBlkOff];
1188  const_little_block_type A_cur =
1189  getConstLocalBlockFromAbsOffset (absBlkOff);
1190  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1191 
1192  // X_lcl += alpha*A_cur*X_cur
1193  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1194  GEMV (alpha, A_cur, X_cur, X_lcl);
1195  } // for each entry in the current local row of the matrix
1196 
1197  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1198  auto X_update = X.getLocalBlock (actlRow, j);
1199  FILL (X_update, zero);
1200  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1201  } // for each vector (column) of the multivector
1202  } // for each local row of the matrix
1203  }
1204  }
1205 
1206  template <class Scalar, class LO, class GO, class Node>
1207  void
1208  BlockCrsMatrix<Scalar, LO, GO, Node>::
1209  gaussSeidelCopy (MultiVector<Scalar, LO, GO, Node>& X,
1210  const MultiVector<Scalar, LO, GO, Node>& B,
1211  const MultiVector<Scalar, LO, GO, Node>& D,
1212  const Scalar& dampingFactor,
1213  const ESweepDirection direction,
1214  const int numSweeps,
1215  const bool zeroInitialGuess) const
1216  {
1217  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1218  // interface for block Gauss-Seidel.
1219  TEUCHOS_TEST_FOR_EXCEPTION(
1220  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1221  "gaussSeidelCopy: Not implemented.");
1222  }
1223 
1224  template <class Scalar, class LO, class GO, class Node>
1225  void
1226  BlockCrsMatrix<Scalar, LO, GO, Node>::
1227  reorderedGaussSeidelCopy (MultiVector<Scalar, LO, GO, Node>& X,
1228  const MultiVector<Scalar, LO, GO, Node>& B,
1229  const MultiVector<Scalar, LO, GO, Node>& D,
1230  const Teuchos::ArrayView<LO>& rowIndices,
1231  const Scalar& dampingFactor,
1232  const ESweepDirection direction,
1233  const int numSweeps,
1234  const bool zeroInitialGuess) const
1235  {
1236  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1237  // interface for block Gauss-Seidel.
1238  TEUCHOS_TEST_FOR_EXCEPTION(
1239  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1240  "reorderedGaussSeidelCopy: Not implemented.");
1241  }
1242 
1243  template <class Scalar, class LO, class GO, class Node>
1244  void TPETRA_DEPRECATED
1245  BlockCrsMatrix<Scalar, LO, GO, Node>::
1246  getLocalDiagCopy (BlockCrsMatrix<Scalar, LO, GO, Node>& diag,
1247  const Teuchos::ArrayView<const size_t>& offsets) const
1248  {
1249  using Teuchos::ArrayRCP;
1250  using Teuchos::ArrayView;
1251  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
1252 
1253  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
1254  const LO* columnIndices;
1255  Scalar* vals;
1256  LO numColumns;
1257  Teuchos::Array<LO> cols(1);
1258 
1259  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
1260  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
1261  for (size_t i = 0; i < myNumRows; ++i) {
1262  cols[0] = i;
1263  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1264  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
1265  }
1266  else {
1267  getLocalRowView (i, columnIndices, vals, numColumns);
1268  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
1269  }
1270  }
1271  }
1272 
1273 
1274  template <class Scalar, class LO, class GO, class Node>
1275  void
1276  BlockCrsMatrix<Scalar, LO, GO, Node>::
1277  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1278  Kokkos::MemoryUnmanaged>& diag,
1279  const Kokkos::View<const size_t*, device_type,
1280  Kokkos::MemoryUnmanaged>& offsets) const
1281  {
1282  using Kokkos::parallel_for;
1283  typedef typename device_type::execution_space execution_space;
1284  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1285 
1286  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1287  const LO blockSize = this->getBlockSize ();
1288  TEUCHOS_TEST_FOR_EXCEPTION
1289  (static_cast<LO> (diag.dimension_0 ()) < lclNumMeshRows ||
1290  static_cast<LO> (diag.dimension_1 ()) < blockSize ||
1291  static_cast<LO> (diag.dimension_2 ()) < blockSize,
1292  std::invalid_argument, prefix <<
1293  "The input Kokkos::View is not big enough to hold all the data.");
1294  TEUCHOS_TEST_FOR_EXCEPTION
1295  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1296  prefix << "offsets.size() = " << offsets.size () << " < local number of "
1297  "diagonal blocks " << lclNumMeshRows << ".");
1298 
1299 #ifdef HAVE_TPETRA_DEBUG
1300  TEUCHOS_TEST_FOR_EXCEPTION
1301  (this->template need_sync<device_type> (), std::runtime_error,
1302  prefix << "The matrix's data were last modified on host, but have "
1303  "not been sync'd to device. Please sync to device (by calling "
1304  "sync<device_type>() on this matrix) before calling this method.");
1305 #endif // HAVE_TPETRA_DEBUG
1306 
1307  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1308  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1309 
1310  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1311  // we reserve the right to do lazy allocation of device data. (We
1312  // don't plan to do lazy allocation for host data; the host
1313  // version of the data always exists.)
1315  auto vals_dev =
1316  const_cast<this_type*> (this)->template getValues<device_type> ();
1317 
1318  parallel_for (policy_type (0, lclNumMeshRows),
1319  functor_type (diag, vals_dev, offsets,
1320  graph_.getLocalGraph ().row_map, blockSize_));
1321  }
1322 
1323 
1324  template <class Scalar, class LO, class GO, class Node>
1325  void
1326  BlockCrsMatrix<Scalar, LO, GO, Node>::
1327  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1328  Kokkos::MemoryUnmanaged>& diag,
1329  const Teuchos::ArrayView<const size_t>& offsets) const
1330  {
1331  using Kokkos::ALL;
1332  using Kokkos::parallel_for;
1333  typedef typename Kokkos::View<impl_scalar_type***, device_type,
1334  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
1335 
1336  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1337  const LO blockSize = this->getBlockSize ();
1338  TEUCHOS_TEST_FOR_EXCEPTION
1339  (static_cast<LO> (diag.dimension_0 ()) < lclNumMeshRows ||
1340  static_cast<LO> (diag.dimension_1 ()) < blockSize ||
1341  static_cast<LO> (diag.dimension_2 ()) < blockSize,
1342  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1343  "The input Kokkos::View is not big enough to hold all the data.");
1344  TEUCHOS_TEST_FOR_EXCEPTION
1345  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
1346  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1347  "offsets.size() = " << offsets.size () << " < local number of diagonal "
1348  "blocks " << lclNumMeshRows << ".");
1349 
1350  // mfh 12 Dec 2015: Use the host execution space, since we haven't
1351  // quite made everything work with CUDA yet.
1352  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
1353  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
1354  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
1355  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
1356  COPY (D_in, D_out);
1357  });
1358  }
1359 
1360 
1361  template<class Scalar, class LO, class GO, class Node>
1362  LO
1363  BlockCrsMatrix<Scalar, LO, GO, Node>::
1364  absMaxLocalValues (const LO localRowInd,
1365  const LO colInds[],
1366  const Scalar vals[],
1367  const LO numColInds) const
1368  {
1369  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1370  // We modified no values, because the input local row index is
1371  // invalid on the calling process. That may not be an error, if
1372  // numColInds is zero anyway; it doesn't matter. This is the
1373  // advantage of returning the number of valid indices.
1374  return static_cast<LO> (0);
1375  }
1376  const impl_scalar_type* const vIn =
1377  reinterpret_cast<const impl_scalar_type*> (vals);
1378  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1379  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1380  const LO perBlockSize = this->offsetPerBlock ();
1381  LO hint = 0; // Guess for the relative offset into the current row
1382  LO pointOffset = 0; // Current offset into input values
1383  LO validCount = 0; // number of valid column indices in colInds
1384 
1385  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1386  const LO relBlockOffset =
1387  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1388  if (relBlockOffset != LINV) {
1389  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1390  little_block_type A_old =
1391  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1392  const_little_block_type A_new =
1393  getConstLocalBlockFromInput (vIn, pointOffset);
1394 
1395  Impl::absMax (A_old, A_new);
1396  hint = relBlockOffset + 1;
1397  ++validCount;
1398  }
1399  }
1400  return validCount;
1401  }
1402 
1403 
1404  template<class Scalar, class LO, class GO, class Node>
1405  LO
1406  BlockCrsMatrix<Scalar, LO, GO, Node>::
1407  sumIntoLocalValues (const LO localRowInd,
1408  const LO colInds[],
1409  const Scalar vals[],
1410  const LO numColInds) const
1411  {
1412 #ifdef HAVE_TPETRA_DEBUG
1413  const char prefix[] =
1414  "Tpetra::Experimental::BlockCrsMatrix::sumIntoLocalValues: ";
1415 #endif // HAVE_TPETRA_DEBUG
1416 
1417  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1418  // We modified no values, because the input local row index is
1419  // invalid on the calling process. That may not be an error, if
1420  // numColInds is zero anyway; it doesn't matter. This is the
1421  // advantage of returning the number of valid indices.
1422  return static_cast<LO> (0);
1423  }
1424  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1425  const impl_scalar_type* const vIn =
1426  reinterpret_cast<const impl_scalar_type*> (vals);
1427  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1428  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1429  const LO perBlockSize = this->offsetPerBlock ();
1430  LO hint = 0; // Guess for the relative offset into the current row
1431  LO pointOffset = 0; // Current offset into input values
1432  LO validCount = 0; // number of valid column indices in colInds
1433 
1434 #ifdef HAVE_TPETRA_DEBUG
1435  TEUCHOS_TEST_FOR_EXCEPTION
1436  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1437  prefix << "The matrix's data were last modified on device, but have not "
1438  "been sync'd to host. Please sync to host (by calling "
1439  "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
1440 #endif // HAVE_TPETRA_DEBUG
1441 
1442  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1443  // version of the data always exists (no lazy allocation for host
1444  // data).
1446  auto vals_host_out =
1447  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1448  impl_scalar_type* vals_host_out_raw = vals_host_out.ptr_on_device ();
1449 
1450  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1451  const LO relBlockOffset =
1452  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1453  if (relBlockOffset != LINV) {
1454  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1455  // are stored contiguously, with no padding. "Contiguously"
1456  // means that all memory between the first and last entries
1457  // belongs to the block (no striding). "No padding" means
1458  // that getBlockSize() * getBlockSize() is exactly the number
1459  // of entries that the block uses. For another place where
1460  // this assumption is encoded, see replaceLocalValues.
1461 
1462  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1463  // little_block_type A_old =
1464  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1465  impl_scalar_type* const A_old =
1466  vals_host_out_raw + absBlockOffset * perBlockSize;
1467  // const_little_block_type A_new =
1468  // getConstLocalBlockFromInput (vIn, pointOffset);
1469  const impl_scalar_type* const A_new = vIn + pointOffset;
1470  // AXPY (ONE, A_new, A_old);
1471  for (LO i = 0; i < perBlockSize; ++i) {
1472  A_old[i] += A_new[i];
1473  }
1474  hint = relBlockOffset + 1;
1475  ++validCount;
1476  }
1477  }
1478  return validCount;
1479  }
1480 
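  // Example (illustrative sketch, not part of the library): summing one
  // block into an existing row. The names A, lclRow, and lclCol, and the
  // block size of 2 below are hypothetical. sumIntoLocalValues expects the
  // values for each requested column as a contiguous, row-major
  // blockSize x blockSize block, and returns how many of the given column
  // indices were actually found in the row's sparsity pattern.
  //
  //   const LO lclCols[1] = { lclCol };
  //   const Scalar blkVals[4] = { 1.0, 2.0,    // row 0 of the 2x2 block
  //                               3.0, 4.0 };  // row 1 of the 2x2 block
  //   const LO numValid = A.sumIntoLocalValues (lclRow, lclCols, blkVals, 1);
  //   // numValid == 1 if and only if (lclRow, lclCol) exists in the graph.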
1481  template<class Scalar, class LO, class GO, class Node>
1482  LO
1483  BlockCrsMatrix<Scalar, LO, GO, Node>::
1484  getLocalRowView (const LO localRowInd,
1485  const LO*& colInds,
1486  Scalar*& vals,
1487  LO& numInds) const
1488  {
1489 #ifdef HAVE_TPETRA_DEBUG
1490  const char prefix[] =
1491  "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: ";
1492 #endif // HAVE_TPETRA_DEBUG
1493 
1494  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1495  colInds = NULL;
1496  vals = NULL;
1497  numInds = 0;
1498  return Teuchos::OrdinalTraits<LO>::invalid ();
1499  }
1500  else {
1501  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1502  colInds = indHost_.ptr_on_device () + absBlockOffsetStart;
1503 
1504 #ifdef HAVE_TPETRA_DEBUG
1505  TEUCHOS_TEST_FOR_EXCEPTION
1506  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1507  prefix << "The matrix's data were last modified on device, but have "
1508  "not been sync'd to host. Please sync to host (by calling "
1509  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1510  "method.");
1511 #endif // HAVE_TPETRA_DEBUG
1512 
1513  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1514  // version of the data always exists (no lazy allocation for host
1515  // data).
1516  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
1517  auto vals_host_out =
1518  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1519  impl_scalar_type* vals_host_out_raw = vals_host_out.ptr_on_device ();
1520  impl_scalar_type* const vOut = vals_host_out_raw +
1521  absBlockOffsetStart * offsetPerBlock ();
1522  vals = reinterpret_cast<Scalar*> (vOut);
1523 
1524  numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1525  return 0; // indicates no error
1526  }
1527  }
1528 
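  // Example (illustrative sketch): reading a row through getLocalRowView.
  // The matrix A and the row index lclRow are hypothetical. On success the
  // method returns 0 and exposes numInds column indices plus numInds
  // row-major blockSize x blockSize blocks, stored back to back in vals.
  //
  //   const LO* cols = NULL;
  //   Scalar* vals = NULL;
  //   LO numInds = 0;
  //   if (A.getLocalRowView (lclRow, cols, vals, numInds) == 0) {
  //     const LO bs = A.getBlockSize ();
  //     for (LO k = 0; k < numInds; ++k) {
  //       const Scalar* blk = vals + k * bs * bs; // block for column cols[k]
  //       const Scalar blk00 = blk[0];            // entry (0,0) of that block
  //       (void) blk00;
  //     }
  //   }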
1529  template<class Scalar, class LO, class GO, class Node>
1530  void
1531  BlockCrsMatrix<Scalar, LO, GO, Node>::
1532  getLocalRowCopy (LO LocalRow,
1533  const Teuchos::ArrayView<LO>& Indices,
1534  const Teuchos::ArrayView<Scalar>& Values,
1535  size_t &NumEntries) const
1536  {
1537  const LO *colInds;
1538  Scalar *vals;
1539  LO numInds;
1540  getLocalRowView(LocalRow,colInds,vals,numInds);
1541  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
1542  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1543  "Tpetra::BlockCrsMatrix::getLocalRowCopy: The Indices and/or Values array is not large enough to hold "
1544  << numInds << " block row entries (Values needs blockSize x blockSize scalars per entry).");
1545  }
1546  for (LO i=0; i<numInds; ++i) {
1547  Indices[i] = colInds[i];
1548  }
1549  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1550  Values[i] = vals[i];
1551  }
1552  NumEntries = numInds;
1553  }
1554 
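  // Example (illustrative sketch): sizing the output arrays for
  // getLocalRowCopy. The matrix A and the row index lclRow are
  // hypothetical. Indices needs at least numEnt slots and Values needs at
  // least numEnt * blockSize * blockSize slots, where numEnt is the number
  // of block entries in the row.
  //
  //   const size_t numEnt = A.getNumEntriesInLocalRow (lclRow);
  //   const LO bs = A.getBlockSize ();
  //   Teuchos::Array<LO> inds (numEnt);
  //   Teuchos::Array<Scalar> vals (numEnt * bs * bs);
  //   size_t numCopied = 0;
  //   A.getLocalRowCopy (lclRow, inds (), vals (), numCopied);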
1555  template<class Scalar, class LO, class GO, class Node>
1556  LO
1557  BlockCrsMatrix<Scalar, LO, GO, Node>::
1558  getLocalRowOffsets (const LO localRowInd,
1559  ptrdiff_t offsets[],
1560  const LO colInds[],
1561  const LO numColInds) const
1562  {
1563  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1564  // We got no offsets, because the input local row index is
1565  // invalid on the calling process. That may not be an error, if
1566  // numColInds is zero anyway; it doesn't matter. This is the
1567  // advantage of returning the number of valid indices.
1568  return static_cast<LO> (0);
1569  }
1570 
1571  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1572  LO hint = 0; // Guess for the relative offset into the current row
1573  LO validCount = 0; // number of valid column indices in colInds
1574 
1575  for (LO k = 0; k < numColInds; ++k) {
1576  const LO relBlockOffset =
1577  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1578  if (relBlockOffset != LINV) {
1579  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1580  hint = relBlockOffset + 1;
1581  ++validCount;
1582  }
1583  }
1584  return validCount;
1585  }
1586 
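  // Example (illustrative sketch): precomputing block offsets for a row.
  // The matrix A, the row index lclRow, and the stencil columns below are
  // hypothetical. Offsets for columns that are not in the row's sparsity
  // pattern are left untouched, so initialize them to -1 (which converts
  // to the "invalid" marker that the *ValuesByOffsets methods skip).
  //
  //   const LO stencil[3] = { lclRow - 1, lclRow, lclRow + 1 };
  //   ptrdiff_t offsets[3] = { -1, -1, -1 };
  //   const LO numValid = A.getLocalRowOffsets (lclRow, offsets, stencil, 3);
  //   // The offsets can now be reused many times; see the sketch after
  //   // sumIntoLocalValuesByOffsets below.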
1587 
1588  template<class Scalar, class LO, class GO, class Node>
1589  LO
1590  BlockCrsMatrix<Scalar, LO, GO, Node>::
1591  replaceLocalValuesByOffsets (const LO localRowInd,
1592  const ptrdiff_t offsets[],
1593  const Scalar vals[],
1594  const LO numOffsets) const
1595  {
1596  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1597  // We modified no values, because the input local row index is
1598  // invalid on the calling process. That may not be an error, if
1599  // numColInds is zero anyway; it doesn't matter. This is the
1600  // advantage of returning the number of valid indices.
1601  return static_cast<LO> (0);
1602  }
1603  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1604 
1605  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1606  const size_t perBlockSize = static_cast<size_t> (offsetPerBlock ());
1607  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1608  size_t pointOffset = 0; // Current offset into input values
1609  LO validCount = 0; // number of valid offsets
1610 
1611  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1612  const size_t relBlockOffset = offsets[k];
1613  if (relBlockOffset != STINV) {
1614  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1615  little_block_type A_old =
1616  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1617  const_little_block_type A_new =
1618  getConstLocalBlockFromInput (vIn, pointOffset);
1619  COPY (A_new, A_old);
1620  ++validCount;
1621  }
1622  }
1623  return validCount;
1624  }
1625 
1626 
1627  template<class Scalar, class LO, class GO, class Node>
1628  LO
1629  BlockCrsMatrix<Scalar, LO, GO, Node>::
1630  absMaxLocalValuesByOffsets (const LO localRowInd,
1631  const ptrdiff_t offsets[],
1632  const Scalar vals[],
1633  const LO numOffsets) const
1634  {
1635  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1636  // We modified no values, because the input local row index is
1637  // invalid on the calling process. That may not be an error, if
1638  // numColInds is zero anyway; it doesn't matter. This is the
1639  // advantage of returning the number of valid indices.
1640  return static_cast<LO> (0);
1641  }
1642  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1643 
1644  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1645  const size_t perBlockSize = static_cast<size_t> (offsetPerBlock ());
1646  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1647  size_t pointOffset = 0; // Current offset into input values
1648  LO validCount = 0; // number of valid offsets
1649 
1650  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1651  const size_t relBlockOffset = offsets[k];
1652  if (relBlockOffset != STINV) {
1653  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1654  little_block_type A_old =
1655  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1656  const_little_block_type A_new =
1657  getConstLocalBlockFromInput (vIn, pointOffset);
1658  Impl::absMax (A_old, A_new);
1659  ++validCount;
1660  }
1661  }
1662  return validCount;
1663  }
1664 
1665 
1666  template<class Scalar, class LO, class GO, class Node>
1667  LO
1668  BlockCrsMatrix<Scalar, LO, GO, Node>::
1669  sumIntoLocalValuesByOffsets (const LO localRowInd,
1670  const ptrdiff_t offsets[],
1671  const Scalar vals[],
1672  const LO numOffsets) const
1673  {
1674  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1675  // We modified no values, because the input local row index is
1676  // invalid on the calling process. That may not be an error, if
1677  // numColInds is zero anyway; it doesn't matter. This is the
1678  // advantage of returning the number of valid indices.
1679  return static_cast<LO> (0);
1680  }
1681  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1682  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1683 
1684  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1685  const size_t perBlockSize = static_cast<size_t> (offsetPerBlock ());
1686  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1687  size_t pointOffset = 0; // Current offset into input values
1688  LO validCount = 0; // number of valid offsets
1689 
1690  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1691  const size_t relBlockOffset = offsets[k];
1692  if (relBlockOffset != STINV) {
1693  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1694  little_block_type A_old =
1695  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1696  const_little_block_type A_new =
1697  getConstLocalBlockFromInput (vIn, pointOffset);
1698  //A_old.update (ONE, A_new);
1699  AXPY (ONE, A_new, A_old);
1700  ++validCount;
1701  }
1702  }
1703  return validCount;
1704  }
1705 
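  // Example (illustrative sketch): reusing precomputed offsets to
  // accumulate into the same row many times (say, once per time step)
  // without repeating the column search. The names A, lclRow, offsets,
  // numValid, numSteps, and newBlockVals are hypothetical; offsets and
  // numValid would come from a prior getLocalRowOffsets call for the same
  // row and columns, and newBlockVals holds one row-major
  // blockSize x blockSize block per requested column.
  //
  //   for (int step = 0; step < numSteps; ++step) {
  //     A.sumIntoLocalValuesByOffsets (lclRow, offsets, newBlockVals, numValid);
  //   }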
1706 
1707  template<class Scalar, class LO, class GO, class Node>
1708  size_t
1709  BlockCrsMatrix<Scalar, LO, GO, Node>::
1710  getNumEntriesInLocalRow (const LO localRowInd) const
1711  {
1712  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1713  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1714  return static_cast<size_t> (0); // the calling process doesn't have that row
1715  } else {
1716  return numEntInGraph;
1717  }
1718  }
1719 
1720  template<class Scalar, class LO, class GO, class Node>
1721  void
1722  BlockCrsMatrix<Scalar, LO, GO, Node>::
1723  applyBlockTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1724  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1725  const Teuchos::ETransp mode,
1726  const Scalar alpha,
1727  const Scalar beta)
1728  {
1729  (void) X;
1730  (void) Y;
1731  (void) mode;
1732  (void) alpha;
1733  (void) beta;
1734 
1735  TEUCHOS_TEST_FOR_EXCEPTION(
1736  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::apply: "
1737  "transpose and conjugate transpose modes are not yet implemented.");
1738  }
1739 
1740  template<class Scalar, class LO, class GO, class Node>
1741  void
1742  BlockCrsMatrix<Scalar, LO, GO, Node>::
1743  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1744  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1745  const Scalar alpha,
1746  const Scalar beta)
1747  {
1748  using Teuchos::RCP;
1749  using Teuchos::rcp;
1750  typedef Tpetra::Import<LO, GO, Node> import_type;
1751  typedef Tpetra::Export<LO, GO, Node> export_type;
1752  const Scalar zero = STS::zero ();
1753  const Scalar one = STS::one ();
1754  RCP<const import_type> import = graph_.getImporter ();
1755  // "export" is a reserved C++ keyword, so we can't use it.
1756  RCP<const export_type> theExport = graph_.getExporter ();
1757 
1758  // FIXME (mfh 20 May 2014) X.mv_ and Y.mv_ requires a friend
1759  // declaration, which is useful only for debugging.
1760  TEUCHOS_TEST_FOR_EXCEPTION(
1761  X.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
1762  "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
1763  "The input BlockMultiVector X has deep copy semantics, "
1764  "not view semantics (as it should).");
1765  TEUCHOS_TEST_FOR_EXCEPTION(
1766  Y.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
1767  "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
1768  "The output BlockMultiVector Y has deep copy semantics, "
1769  "not view semantics (as it should).");
1770 
1771  if (alpha == zero) {
1772  if (beta == zero) {
1773  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1774  } else if (beta != one) {
1775  Y.scale (beta);
1776  }
1777  } else { // alpha != 0
1778  const BMV* X_colMap = NULL;
1779  if (import.is_null ()) {
1780  try {
1781  X_colMap = &X;
1782  } catch (std::exception& e) {
1783  TEUCHOS_TEST_FOR_EXCEPTION(
1784  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1785  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1786  "operator= threw an exception: " << std::endl << e.what ());
1787  }
1788  } else {
1789  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1790  // Y_rowMap_ below. This lets us do lazy initialization
1791  // correctly with view semantics of BlockCrsMatrix. All views
1792  // of this BlockCrsMatrix have the same outer pointer. That
1793  // way, we can set the inner pointer in one view, and all
1794  // other views will see it.
1795  if ((*X_colMap_).is_null () ||
1796  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1797  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1798  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1799  static_cast<LO> (X.getNumVectors ())));
1800  }
1801 #ifdef HAVE_TPETRA_BCRS_DO_POINT_IMPORT
1802  if (pointImporter_->is_null ()) {
1803  // The Import ctor needs RCPs. Map's copy ctor does a shallow copy, so
1804  // these are small operations.
1805  const auto domainPointMap = rcp (new typename BMV::map_type (domainPointMap_));
1806  const auto colPointMap = rcp (new typename BMV::map_type (
1807  BMV::makePointMap (*graph_.getColMap(),
1808  blockSize_)));
1809  *pointImporter_ = rcp (new typename crs_graph_type::import_type (
1810  domainPointMap, colPointMap));
1811  }
1812  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1813  **pointImporter_,
1814  Tpetra::REPLACE);
1815 #else
1816  (**X_colMap_).doImport (X, *import, Tpetra::REPLACE);
1817 #endif
1818  try {
1819  X_colMap = &(**X_colMap_);
1820  } catch (std::exception& e) {
1821  TEUCHOS_TEST_FOR_EXCEPTION(
1822  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1823  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1824  "operator= threw an exception: " << std::endl << e.what ());
1825  }
1826  }
1827 
1828  BMV* Y_rowMap = NULL;
1829  if (theExport.is_null ()) {
1830  Y_rowMap = &Y;
1831  } else {
1832  if ((*Y_rowMap_).is_null () ||
1833  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1834  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1835  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1836  static_cast<LO> (X.getNumVectors ())));
1837  }
1838  // Set the pointer unconditionally. The previous code set it only
1839  // when it (re)allocated *Y_rowMap_, which left Y_rowMap NULL (and
1840  // dereferenced below) whenever a cached row-Map block multivector
1841  // could be reused.
1842  Y_rowMap = &(**Y_rowMap_);
1843  }
1845 
1846  try {
1847  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1848  }
1849  catch (std::exception& e) {
1850  TEUCHOS_TEST_FOR_EXCEPTION
1851  (true, std::runtime_error, "Tpetra::Experimental::BlockCrsMatrix::"
1852  "applyBlockNoTrans: localApplyBlockNoTrans threw an exception: "
1853  << e.what ());
1854  }
1855  catch (...) {
1856  TEUCHOS_TEST_FOR_EXCEPTION
1857  (true, std::runtime_error, "Tpetra::Experimental::BlockCrsMatrix::"
1858  "applyBlockNoTrans: localApplyBlockNoTrans threw some exception "
1859  "that is not a subclass of std::exception.");
1860  }
1861 
1862  if (! theExport.is_null ()) {
1863  Y.doExport (*Y_rowMap, *theExport, Tpetra::REPLACE);
1864  }
1865  }
1866  }
1867 
1868  template<class Scalar, class LO, class GO, class Node>
1869  void
1870  BlockCrsMatrix<Scalar, LO, GO, Node>::
1871  localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1872  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1873  const Scalar alpha,
1874  const Scalar beta)
1875  {
1876  using Tpetra::Experimental::Impl::bcrsLocalApplyNoTrans;
1877 
1878  const impl_scalar_type alpha_impl = alpha;
1879  const auto graph = this->graph_.getLocalGraph ();
1880  const impl_scalar_type beta_impl = beta;
1881  const LO blockSize = this->getBlockSize ();
1882 
1883  auto X_mv = X.getMultiVectorView ();
1884  auto Y_mv = Y.getMultiVectorView ();
1885  Y_mv.template modify<device_type> ();
1886 
1887  auto X_lcl = X_mv.template getLocalView<device_type> ();
1888  auto Y_lcl = Y_mv.template getLocalView<device_type> ();
1889  auto val = this->val_.template view<device_type> ();
1890 
1891  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1892  beta_impl, Y_lcl);
1893  }
1894 
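  // In block terms, localApplyBlockNoTrans computes, for each owned mesh
  // row i and each vector (column) of X and Y,
  //
  //   Y(i) := beta * Y(i) + alpha * sum_{j in row i} A(i,j) * X(j),
  //
  // where A(i,j) is the blockSize x blockSize block stored for column j of
  // row i, and X(j), Y(i) are the corresponding length-blockSize
  // sub-vectors of the block multivectors.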
1895  template<class Scalar, class LO, class GO, class Node>
1896  LO
1897  BlockCrsMatrix<Scalar, LO, GO, Node>::
1898  findRelOffsetOfColumnIndex (const LO localRowIndex,
1899  const LO colIndexToFind,
1900  const LO hint) const
1901  {
1902  const size_t absStartOffset = ptrHost_[localRowIndex];
1903  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1904  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1905  // Amortize pointer arithmetic over the search loop.
1906  const LO* const curInd = indHost_.ptr_on_device () + absStartOffset;
1907 
1908  // If the hint was correct, then the hint is the offset to return.
1909  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1910  // Always return the offset relative to the current row.
1911  return hint;
1912  }
1913 
1914  // The hint was wrong, so we must search for the given column
1915  // index in the column indices for the given row.
1916  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1917 
1918  // We require that the graph have sorted rows. However, binary
1919  // search only pays if the current row is longer than a certain
1920  // amount. We set this to 32, but you might want to tune this.
1921  const LO maxNumEntriesForLinearSearch = 32;
1922  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1923  // Use binary search. It would probably be better for us to
1924  // roll this loop by hand. If we wrote it right, a smart
1925  // compiler could perhaps use conditional loads and avoid
1926  // branches (according to Jed Brown, May 2014).
1927  const LO* beg = curInd;
1928  const LO* end = curInd + numEntriesInRow;
1929  std::pair<const LO*, const LO*> p =
1930  std::equal_range (beg, end, colIndexToFind);
1931  if (p.first != p.second) {
1932  // offset is relative to the current row
1933  relOffset = static_cast<LO> (p.first - beg);
1934  }
1935  }
1936  else { // use linear search
1937  for (LO k = 0; k < numEntriesInRow; ++k) {
1938  if (colIndexToFind == curInd[k]) {
1939  relOffset = k; // offset is relative to the current row
1940  break;
1941  }
1942  }
1943  }
1944 
1945  return relOffset;
1946  }
1947 
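  // Example (illustrative sketch): the 'hint' parameter makes repeated,
  // ascending lookups into the same row O(1), which is exactly how the
  // replace/sumInto methods above use it. The names lclRow, colInds, and
  // numColInds are hypothetical; the queried columns must be visited in
  // the same (sorted) order as they appear in the row for the hint to
  // keep hitting.
  //
  //   LO hint = 0;
  //   for (LO k = 0; k < numColInds; ++k) {
  //     const LO rel = this->findRelOffsetOfColumnIndex (lclRow, colInds[k], hint);
  //     if (rel != Teuchos::OrdinalTraits<LO>::invalid ()) {
  //       hint = rel + 1; // next query can match curInd[hint] immediately
  //     }
  //   }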
1948  template<class Scalar, class LO, class GO, class Node>
1949  LO
1950  BlockCrsMatrix<Scalar, LO, GO, Node>::
1951  offsetPerBlock () const
1952  {
1953  return offsetPerBlock_;
1954  }
1955 
1956  template<class Scalar, class LO, class GO, class Node>
1957  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1958  BlockCrsMatrix<Scalar, LO, GO, Node>::
1959  getConstLocalBlockFromInput (const impl_scalar_type* val,
1960  const size_t pointOffset) const
1961  {
1962  // Row major blocks
1963  const LO rowStride = blockSize_;
1964  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1965  }
1966 
1967  template<class Scalar, class LO, class GO, class Node>
1968  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1969  BlockCrsMatrix<Scalar, LO, GO, Node>::
1970  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1971  const size_t pointOffset) const
1972  {
1973  // Row major blocks
1974  const LO rowStride = blockSize_;
1975  return little_block_type (val + pointOffset, blockSize_, rowStride);
1976  }
1977 
1978  template<class Scalar, class LO, class GO, class Node>
1979  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1980  BlockCrsMatrix<Scalar, LO, GO, Node>::
1981  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1982  {
1983 #ifdef HAVE_TPETRA_DEBUG
1984  const char prefix[] =
1985  "Tpetra::Experimental::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
1986 #endif // HAVE_TPETRA_DEBUG
1987 
1988  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
1989  // An empty block signifies an error. We don't expect to see
1990  // this error in correct code, but it's helpful for avoiding
1991  // memory corruption in case there is a bug.
1992  return const_little_block_type ();
1993  }
1994  else {
1995 #ifdef HAVE_TPETRA_DEBUG
1996  TEUCHOS_TEST_FOR_EXCEPTION
1997  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1998  prefix << "The matrix's data were last modified on device, but have "
1999  "not been sync'd to host. Please sync to host (by calling "
2000  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2001  "method.");
2002 #endif // HAVE_TPETRA_DEBUG
2003  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2004 
2005  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
2006  // version of the data always exists (no lazy allocation for host
2007  // data).
2008  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2009  auto vals_host =
2010  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
2011  const impl_scalar_type* vals_host_raw = vals_host.ptr_on_device ();
2012 
2013  return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2014  }
2015  }
2016 
2017  template<class Scalar, class LO, class GO, class Node>
2018  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
2019  BlockCrsMatrix<Scalar, LO, GO, Node>::
2020  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
2021  const size_t relMeshOffset) const
2022  {
2023  typedef impl_scalar_type IST;
2024 
2025  const LO* lclColInds = NULL;
2026  Scalar* lclVals = NULL;
2027  LO numEnt = 0;
2028 
2029  LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
2030  if (err != 0) {
2031  // An empty block signifies an error. We don't expect to see
2032  // this error in correct code, but it's helpful for avoiding
2033  // memory corruption in case there is a bug.
2034  return const_little_block_type ();
2035  }
2036  else {
2037  const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
2038  IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
2039  return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
2040  relPointOffset);
2041  }
2042  }
2043 
2044  template<class Scalar, class LO, class GO, class Node>
2045  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
2046  BlockCrsMatrix<Scalar, LO, GO, Node>::
2047  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
2048  {
2049 #ifdef HAVE_TPETRA_DEBUG
2050  const char prefix[] =
2051  "Tpetra::Experimental::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
2052 #endif // HAVE_TPETRA_DEBUG
2053 
2054  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
2055  // An empty block signifies an error. We don't expect to see
2056  // this error in correct code, but it's helpful for avoiding
2057  // memory corruption in case there is a bug.
2058  return little_block_type ();
2059  }
2060  else {
2061  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2062 #ifdef HAVE_TPETRA_DEBUG
2063  TEUCHOS_TEST_FOR_EXCEPTION
2064  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
2065  prefix << "The matrix's data were last modified on device, but have "
2066  "not been sync'd to host. Please sync to host (by calling "
2067  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2068  "method.");
2069 #endif // HAVE_TPETRA_DEBUG
2070  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
2071  // version of the data always exists (no lazy allocation for host
2072  // data).
2073  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2074  auto vals_host =
2075  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
2076  impl_scalar_type* vals_host_raw = vals_host.ptr_on_device ();
2077  return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2078  }
2079  }
2080 
2081  template<class Scalar, class LO, class GO, class Node>
2082  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
2083  BlockCrsMatrix<Scalar, LO, GO, Node>::
2084  getLocalBlock (const LO localRowInd, const LO localColInd) const
2085  {
2086  const size_t absRowBlockOffset = ptrHost_[localRowInd];
2087  const LO relBlockOffset =
2088  this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
2089 
2090  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
2091  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
2092  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
2093  }
2094  else {
2095  return little_block_type ();
2096  }
2097  }
2098 
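  // Example (illustrative sketch): modifying a single block in place via
  // getLocalBlock. The matrix A and the indices lclRow, lclCol are
  // hypothetical, and the sketch assumes the returned little-block view
  // supports (i,j) indexing. The view aliases the matrix's own storage; a
  // default-constructed (empty) block means the entry does not exist.
  //
  //   auto blk = A.getLocalBlock (lclRow, lclCol); // view into A's storage
  //   // (assume here that (lclRow, lclCol) is known to be in the graph)
  //   for (LO i = 0; i < A.getBlockSize (); ++i) {
  //     blk(i, i) += 1.0;                          // shift the block diagonal
  //   }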
2099  // template<class Scalar, class LO, class GO, class Node>
2100  // void
2101  // BlockCrsMatrix<Scalar, LO, GO, Node>::
2102  // clearLocalErrorStateAndStream ()
2103  // {
2104  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2105  // * (const_cast<this_type*> (this)->localError_) = false;
2106  // *errs_ = Teuchos::null;
2107  // }
2108 
2109  template<class Scalar, class LO, class GO, class Node>
2110  bool
2111  BlockCrsMatrix<Scalar, LO, GO, Node>::
2112  checkSizes (const Tpetra::SrcDistObject& source)
2113  {
2114  using std::endl;
2115  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2116  const this_type* src = dynamic_cast<const this_type* > (&source);
2117 
2118  if (src == NULL) {
2119  std::ostream& err = markLocalErrorAndGetStream ();
2120  err << "checkSizes: The source object of the Import or Export "
2121  "must be a BlockCrsMatrix with the same template parameters as the "
2122  "target object." << endl;
2123  }
2124  else {
2125  // Use a string of ifs, not if-elseifs, because we want to know
2126  // all the errors.
2127  if (src->getBlockSize () != this->getBlockSize ()) {
2128  std::ostream& err = markLocalErrorAndGetStream ();
2129  err << "checkSizes: The source and target objects of the Import or "
2130  << "Export must have the same block sizes. The source's block "
2131  << "size = " << src->getBlockSize () << " != the target's block "
2132  << "size = " << this->getBlockSize () << "." << endl;
2133  }
2134  if (! src->graph_.isFillComplete ()) {
2135  std::ostream& err = markLocalErrorAndGetStream ();
2136  err << "checkSizes: The source object of the Import or Export is "
2137  "not fill complete. Both source and target objects must be fill "
2138  "complete." << endl;
2139  }
2140  if (! this->graph_.isFillComplete ()) {
2141  std::ostream& err = markLocalErrorAndGetStream ();
2142  err << "checkSizes: The target object of the Import or Export is "
2143  "not fill complete. Both source and target objects must be fill "
2144  "complete." << endl;
2145  }
2146  if (src->graph_.getColMap ().is_null ()) {
2147  std::ostream& err = markLocalErrorAndGetStream ();
2148  err << "checkSizes: The source object of the Import or Export does "
2149  "not have a column Map. Both source and target objects must have "
2150  "column Maps." << endl;
2151  }
2152  if (this->graph_.getColMap ().is_null ()) {
2153  std::ostream& err = markLocalErrorAndGetStream ();
2154  err << "checkSizes: The target object of the Import or Export does "
2155  "not have a column Map. Both source and target objects must have "
2156  "column Maps." << endl;
2157  }
2158  }
2159 
2160  return ! (* (this->localError_));
2161  }
2162 
2163  template<class Scalar, class LO, class GO, class Node>
2164  void
2165  BlockCrsMatrix<Scalar, LO, GO, Node>::
2166  copyAndPermute (const Tpetra::SrcDistObject& source,
2167  size_t numSameIDs,
2168  const Teuchos::ArrayView<const LO>& permuteToLIDs,
2169  const Teuchos::ArrayView<const LO>& permuteFromLIDs)
2170  {
2171  using std::endl;
2172  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2173  const bool debug = false;
2174 
2175  if (debug) {
2176  std::ostringstream os;
2177  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2178  os << "Proc " << myRank << ": copyAndPermute: "
2179  << "numSameIDs: " << numSameIDs
2180  << ", permuteToLIDs.size(): " << permuteToLIDs.size ()
2181  << ", permuteFromLIDs.size(): " << permuteFromLIDs.size ()
2182  << endl;
2183  std::cerr << os.str ();
2184  }
2185 
2186  // There's no communication in this method, so it's safe just to
2187  // return on error.
2188 
2189  if (* (this->localError_)) {
2190  std::ostream& err = this->markLocalErrorAndGetStream ();
2191  err << "copyAndPermute: The target object of the Import or Export is "
2192  "already in an error state." << endl;
2193  return;
2194  }
2195  if (permuteToLIDs.size () != permuteFromLIDs.size ()) {
2196  std::ostream& err = this->markLocalErrorAndGetStream ();
2197  err << "copyAndPermute: permuteToLIDs.size() = " << permuteToLIDs.size ()
2198  << " != permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."
2199  << endl;
2200  return;
2201  }
2202 
2203  const this_type* src = dynamic_cast<const this_type* > (&source);
2204  if (src == NULL) {
2205  std::ostream& err = this->markLocalErrorAndGetStream ();
2206  err << "copyAndPermute: The source object of the Import or Export is "
2207  "either not a BlockCrsMatrix, or does not have the right template "
2208  "parameters. checkSizes() should have caught this. "
2209  "Please report this bug to the Tpetra developers." << endl;
2210  return;
2211  }
2212  if (* (src->localError_)) {
2213  std::ostream& err = this->markLocalErrorAndGetStream ();
2214  err << "copyAndPermute: The source object of the Import or Export is "
2215  "already in an error state." << endl;
2216  return;
2217  }
2218 
2219  bool lclErr = false;
2220 #ifdef HAVE_TPETRA_DEBUG
2221  std::set<LO> invalidSrcCopyRows;
2222  std::set<LO> invalidDstCopyRows;
2223  std::set<LO> invalidDstCopyCols;
2224  std::set<LO> invalidDstPermuteCols;
2225  std::set<LO> invalidPermuteFromRows;
2226 #endif // HAVE_TPETRA_DEBUG
2227 
2228  // Copy the initial sequence of rows that are the same.
2229  //
2230  // The two graphs might have different column Maps, so we need to
2231  // do this using global column indices. This is purely local, so
2232  // we only need to check for local sameness of the two column
2233  // Maps.
2234 
2235 #ifdef HAVE_TPETRA_DEBUG
2236  const map_type& srcRowMap = * (src->graph_.getRowMap ());
2237 #endif // HAVE_TPETRA_DEBUG
2238  const map_type& dstRowMap = * (this->graph_.getRowMap ());
2239  const map_type& srcColMap = * (src->graph_.getColMap ());
2240  const map_type& dstColMap = * (this->graph_.getColMap ());
2241  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
2242  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.size ());
2243 
2244  if (debug) {
2245  std::ostringstream os;
2246  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2247  os << "Proc " << myRank << ": copyAndPermute: "
2248  << "canUseLocalColumnIndices: "
2249  << (canUseLocalColumnIndices ? "true" : "false")
2250  << ", numPermute: " << numPermute
2251  << endl;
2252  std::cerr << os.str ();
2253  }
2254 
2255  if (canUseLocalColumnIndices) {
2256  // Copy local rows that are the "same" in both source and target.
2257  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2258 #ifdef HAVE_TPETRA_DEBUG
2259  if (! srcRowMap.isNodeLocalElement (localRow)) {
2260  lclErr = true;
2261  invalidSrcCopyRows.insert (localRow);
2262  continue; // skip invalid rows
2263  }
2264 #endif // HAVE_TPETRA_DEBUG
2265 
2266  const LO* lclSrcCols;
2267  Scalar* vals;
2268  LO numEntries;
2269  // If this call fails, that means the mesh row local index is
2270  // invalid. That means the Import or Export is invalid somehow.
2271  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2272  if (err != 0) {
2273  lclErr = true;
2274 #ifdef HAVE_TPETRA_DEBUG
2275  (void) invalidSrcCopyRows.insert (localRow);
2276 #endif // HAVE_TPETRA_DEBUG
2277  }
2278  else {
2279  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
2280  if (err != numEntries) {
2281  lclErr = true;
2282  if (! dstRowMap.isNodeLocalElement (localRow)) {
2283 #ifdef HAVE_TPETRA_DEBUG
2284  invalidDstCopyRows.insert (localRow);
2285 #endif // HAVE_TPETRA_DEBUG
2286  }
2287  else {
2288  // Once there's an error, there's no sense in saving
2289  // time, so we check whether the column indices were
2290  // invalid. However, only remember which ones were
2291  // invalid in a debug build, because that might take a
2292  // lot of space.
2293  for (LO k = 0; k < numEntries; ++k) {
2294  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
2295  lclErr = true;
2296 #ifdef HAVE_TPETRA_DEBUG
2297  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
2298 #endif // HAVE_TPETRA_DEBUG
2299  }
2300  }
2301  }
2302  }
2303  }
2304  } // for each "same" local row
2305 
2306  // Copy the "permute" local rows.
2307  for (size_t k = 0; k < numPermute; ++k) {
2308  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
2309  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
2310 
2311  const LO* lclSrcCols;
2312  Scalar* vals;
2313  LO numEntries;
2314  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2315  if (err != 0) {
2316  lclErr = true;
2317 #ifdef HAVE_TPETRA_DEBUG
2318  invalidPermuteFromRows.insert (srcLclRow);
2319 #endif // HAVE_TPETRA_DEBUG
2320  }
2321  else {
2322  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
2323  if (err != numEntries) {
2324  lclErr = true;
2325 #ifdef HAVE_TPETRA_DEBUG
2326  for (LO c = 0; c < numEntries; ++c) {
2327  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
2328  invalidDstPermuteCols.insert (lclSrcCols[c]);
2329  }
2330  }
2331 #endif // HAVE_TPETRA_DEBUG
2332  }
2333  }
2334  }
2335  }
2336  else { // must convert column indices to global
2337  // Reserve space to store the destination matrix's local column indices.
2338  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
2339  Teuchos::Array<LO> lclDstCols (maxNumEnt);
2340 
2341  // Copy local rows that are the "same" in both source and target.
2342  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2343  const LO* lclSrcCols;
2344  Scalar* vals;
2345  LO numEntries;
2346  // If this call fails, that means the mesh row local index is
2347  // invalid. That means the Import or Export is invalid somehow.
2348  LO err = 0;
2349  try {
2350  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2351  } catch (std::exception& e) {
2352  if (debug) {
2353  std::ostringstream os;
2354  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2355  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2356  << localRow << ", src->getLocalRowView() threw an exception: "
2357  << e.what ();
2358  std::cerr << os.str ();
2359  }
2360  throw e;
2361  }
2362 
2363  if (err != 0) {
2364  lclErr = true;
2365 #ifdef HAVE_TPETRA_DEBUG
2366  invalidSrcCopyRows.insert (localRow);
2367 #endif // HAVE_TPETRA_DEBUG
2368  }
2369  else {
2370  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2371  lclErr = true;
2372  if (debug) {
2373  std::ostringstream os;
2374  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2375  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2376  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2377  << maxNumEnt << endl;
2378  std::cerr << os.str ();
2379  }
2380  }
2381  else {
2382  // Convert the source matrix's local column indices to the
2383  // destination matrix's local column indices.
2384  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2385  for (LO j = 0; j < numEntries; ++j) {
2386  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2387  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2388  lclErr = true;
2389 #ifdef HAVE_TPETRA_DEBUG
2390  invalidDstCopyCols.insert (lclDstColsView[j]);
2391 #endif // HAVE_TPETRA_DEBUG
2392  }
2393  }
2394  try {
2395  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
2396  } catch (std::exception& e) {
2397  if (debug) {
2398  std::ostringstream os;
2399  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2400  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2401  << localRow << ", this->replaceLocalValues() threw an exception: "
2402  << e.what ();
2403  std::cerr << os.str ();
2404  }
2405  throw e;
2406  }
2407  if (err != numEntries) {
2408  lclErr = true;
2409  if (debug) {
2410  std::ostringstream os;
2411  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2412  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2413  "localRow " << localRow << ", this->replaceLocalValues "
2414  "returned " << err << " instead of numEntries = "
2415  << numEntries << endl;
2416  std::cerr << os.str ();
2417  }
2418  }
2419  }
2420  }
2421  }
2422 
2423  // Copy the "permute" local rows.
2424  for (size_t k = 0; k < numPermute; ++k) {
2425  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
2426  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
2427 
2428  const LO* lclSrcCols;
2429  Scalar* vals;
2430  LO numEntries;
2431  LO err = 0;
2432  try {
2433  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2434  } catch (std::exception& e) {
2435  if (debug) {
2436  std::ostringstream os;
2437  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2438  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2439  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2440  << ", src->getLocalRowView() threw an exception: " << e.what ();
2441  std::cerr << os.str ();
2442  }
2443  throw e;
2444  }
2445 
2446  if (err != 0) {
2447  lclErr = true;
2448 #ifdef HAVE_TPETRA_DEBUG
2449  invalidPermuteFromRows.insert (srcLclRow);
2450 #endif // HAVE_TPETRA_DEBUG
2451  }
2452  else {
2453  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2454  lclErr = true;
2455  }
2456  else {
2457  // Convert the source matrix's local column indices to the
2458  // destination matrix's local column indices.
2459  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2460  for (LO j = 0; j < numEntries; ++j) {
2461  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2462  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2463  lclErr = true;
2464 #ifdef HAVE_TPETRA_DEBUG
2465  invalidDstPermuteCols.insert (lclDstColsView[j]);
2466 #endif // HAVE_TPETRA_DEBUG
2467  }
2468  }
2469  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
2470  if (err != numEntries) {
2471  lclErr = true;
2472  }
2473  }
2474  }
2475  }
2476  }
2477 
2478  if (lclErr) {
2479  std::ostream& err = this->markLocalErrorAndGetStream ();
2480 #ifdef HAVE_TPETRA_DEBUG
2481  err << "copyAndPermute: The graph structure of the source of the "
2482  "Import or Export must be a subset of the graph structure of the "
2483  "target. ";
2484  err << "invalidSrcCopyRows = [";
2485  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2486  it != invalidSrcCopyRows.end (); ++it) {
2487  err << *it;
2488  typename std::set<LO>::const_iterator itp1 = it;
2489  itp1++;
2490  if (itp1 != invalidSrcCopyRows.end ()) {
2491  err << ",";
2492  }
2493  }
2494  err << "], invalidDstCopyRows = [";
2495  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2496  it != invalidDstCopyRows.end (); ++it) {
2497  err << *it;
2498  typename std::set<LO>::const_iterator itp1 = it;
2499  itp1++;
2500  if (itp1 != invalidDstCopyRows.end ()) {
2501  err << ",";
2502  }
2503  }
2504  err << "], invalidDstCopyCols = [";
2505  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2506  it != invalidDstCopyCols.end (); ++it) {
2507  err << *it;
2508  typename std::set<LO>::const_iterator itp1 = it;
2509  itp1++;
2510  if (itp1 != invalidDstCopyCols.end ()) {
2511  err << ",";
2512  }
2513  }
2514  err << "], invalidDstPermuteCols = [";
2515  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2516  it != invalidDstPermuteCols.end (); ++it) {
2517  err << *it;
2518  typename std::set<LO>::const_iterator itp1 = it;
2519  itp1++;
2520  if (itp1 != invalidDstPermuteCols.end ()) {
2521  err << ",";
2522  }
2523  }
2524  err << "], invalidPermuteFromRows = [";
2525  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2526  it != invalidPermuteFromRows.end (); ++it) {
2527  err << *it;
2528  typename std::set<LO>::const_iterator itp1 = it;
2529  itp1++;
2530  if (itp1 != invalidPermuteFromRows.end ()) {
2531  err << ",";
2532  }
2533  }
2534  err << "]" << std::endl;
2535 #else
2536  err << "copyAndPermute: The graph structure of the source of the "
2537  "Import or Export must be a subset of the graph structure of the "
2538  "target." << std::endl;
2539 #endif // HAVE_TPETRA_DEBUG
2540  }
2541 
2542  if (debug) {
2543  std::ostringstream os;
2544  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2545  const bool lclSuccess = ! (* (this->localError_));
2546  os << "*** Proc " << myRank << ": copyAndPermute "
2547  << (lclSuccess ? "succeeded" : "FAILED");
2548  if (lclSuccess) {
2549  os << endl;
2550  } else {
2551  os << ": error messages: " << this->errorMessages (); // comes w/ endl
2552  }
2553  std::cerr << os.str ();
2554  }
2555  }
2556 
2557  namespace { // (anonymous)
2558 
2577  template<class LO, class GO, class D>
2578  size_t
2579  packRowCount (const size_t numEnt,
2580  const size_t numBytesPerValue,
2581  const size_t blkSize)
2582  {
2584 
2585  if (numEnt == 0) {
2586  // Empty rows always take zero bytes, to ensure sparsity.
2587  return 0;
2588  }
2589  else {
2590  // We store the number of entries as a local index (LO).
2591  LO numEntLO = 0; // packValueCount wants this.
2592  GO gid;
2593  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2594  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2595  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2596  return numEntLen + gidsLen + valsLen;
2597  }
2598  }
2599 
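  // Worked example (illustrative, assuming packValueCount returns
  // sizeof(T) for these built-in types): with LO = int (4 bytes),
  // GO = long long (8 bytes), double values (numBytesPerValue = 8),
  // blkSize = 3, and numEnt = 5, a packed row occupies
  //
  //   4 + 5 * 8 + 5 * 8 * 3 * 3 = 4 + 40 + 360 = 404 bytes:
  //
  // one LO entry count, then 5 global column indices, then 5 row-major
  // 3x3 blocks of values.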
2610  template<class ST, class LO, class GO, class D>
2611  size_t
2612  unpackRowCount (const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
2613  const size_t offset,
2614  const size_t numBytes,
2615  const size_t numBytesPerValue)
2616  {
2617  using Kokkos::subview;
2619  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
2620  typedef typename input_buffer_type::size_type size_type;
2621 
2622  if (numBytes == 0) {
2623  // Empty rows always take zero bytes, to ensure sparsity.
2624  return static_cast<size_t> (0);
2625  }
2626  else {
2627  LO numEntLO = 0;
2628  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
2629 #ifdef HAVE_TPETRA_DEBUG
2630  TEUCHOS_TEST_FOR_EXCEPTION(
2631  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2632  "theNumBytes = " << theNumBytes << " > numBytes = " << numBytes
2633  << ".");
2634 #endif // HAVE_TPETRA_DEBUG
2635  const std::pair<size_type, size_type> rng (offset, offset + theNumBytes);
2636  input_buffer_type inBuf = subview (imports, rng); // imports (offset, theNumBytes);
2637  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
2638 #ifdef HAVE_TPETRA_DEBUG
2639  TEUCHOS_TEST_FOR_EXCEPTION(
2640  actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2641  "actualNumBytes = " << actualNumBytes << " > numBytes = " << numBytes
2642  << ".");
2643 #else
2644  (void) actualNumBytes;
2645 #endif // HAVE_TPETRA_DEBUG
2646  return static_cast<size_t> (numEntLO);
2647  }
2648  }
2649 
2657  template<class ST, class LO, class GO, class D>
2658  size_t
2659  packRowForBlockCrs (const typename Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
2660  const size_t offset,
2661  const size_t numEnt,
2662  const typename Tpetra::Details::PackTraits<GO, D>::input_array_type& gidsIn,
2663  const typename Tpetra::Details::PackTraits<ST, D>::input_array_type& valsIn,
2664  const size_t numBytesPerValue,
2665  const size_t blockSize)
2666  {
2667  using Kokkos::subview;
2669  // NOTE (mfh 02 Feb 2015) This assumes that output_buffer_type is
2670  // the same, no matter what type we're packing. It's a reasonable
2671  // assumption, given that we go through the trouble of PackTraits
2672  // just so that we can pack data of different types in the same
2673  // buffer.
2674  typedef typename PackTraits<LO, D>::output_buffer_type output_buffer_type;
2675  typedef typename output_buffer_type::size_type size_type;
2676  typedef typename std::pair<size_type, size_type> pair_type;
2677 
2678  if (numEnt == 0) {
2679  // Empty rows always take zero bytes, to ensure sparsity.
2680  return 0;
2681  }
2682  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2683 
2684  const GO gid = 0; // packValueCount wants this
2685  const LO numEntLO = static_cast<LO> (numEnt);
2686 
2687  const size_t numEntBeg = offset;
2688  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2689  const size_t gidsBeg = numEntBeg + numEntLen;
2690  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2691  const size_t valsBeg = gidsBeg + gidsLen;
2692  const size_t valsLen = numScalarEnt * numBytesPerValue;
2693 
2694  output_buffer_type numEntOut =
2695  subview (exports, pair_type (numEntBeg, numEntBeg + numEntLen));
2696  output_buffer_type gidsOut =
2697  subview (exports, pair_type (gidsBeg, gidsBeg + gidsLen));
2698  output_buffer_type valsOut =
2699  subview (exports, pair_type (valsBeg, valsBeg + valsLen));
2700 
2701  size_t numBytesOut = 0;
2702  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
2703  numBytesOut += PackTraits<GO, D>::packArray (gidsOut, gidsIn, numEnt);
2704  numBytesOut += PackTraits<ST, D>::packArray (valsOut, valsIn, numScalarEnt);
2705 
2706  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2707  TEUCHOS_TEST_FOR_EXCEPTION(
2708  numBytesOut != expectedNumBytes, std::logic_error, "packRowForBlockCrs: "
2709  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2710  << expectedNumBytes << ".");
2711  return numBytesOut;
2712  }
2713 
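  // The packed layout produced above (and consumed by unpackRowForBlockCrs
  // below) is, per row:
  //
  //   [ numEnt : LO | gids : numEnt * GO | vals : numEnt * blockSize^2 * ST ]
  //
  // with each block stored row major, in the same column order as gids.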
2714  // Return the number of bytes actually read / used.
2715  template<class ST, class LO, class GO, class D>
2716  size_t
2717  unpackRowForBlockCrs (const typename Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
2718  const typename Tpetra::Details::PackTraits<ST, D>::output_array_type& valsOut,
2719  const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
2720  const size_t offset,
2721  const size_t numBytes,
2722  const size_t numEnt,
2723  const size_t numBytesPerValue,
2724  const size_t blockSize)
2725  {
2726  using Kokkos::subview;
2728  // NOTE (mfh 02 Feb 2015) This assumes that input_buffer_type is
2729  // the same, no matter what type we're unpacking. It's a
2730  // reasonable assumption, given that we go through the trouble of
2731  // PackTraits just so that we can pack data of different types in
2732  // the same buffer.
2733  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
2734  typedef typename input_buffer_type::size_type size_type;
2735  typedef typename std::pair<size_type, size_type> pair_type;
2736 
2737  if (numBytes == 0) {
2738  // Rows with zero bytes always have zero entries.
2739  return 0;
2740  }
2741  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2742  TEUCHOS_TEST_FOR_EXCEPTION(
2743  static_cast<size_t> (imports.dimension_0 ()) <= offset,
2744  std::logic_error, "unpackRow: imports.dimension_0() = "
2745  << imports.dimension_0 () << " <= offset = " << offset << ".");
2746  TEUCHOS_TEST_FOR_EXCEPTION(
2747  static_cast<size_t> (imports.dimension_0 ()) < offset + numBytes,
2748  std::logic_error, "unpackRow: imports.dimension_0() = "
2749  << imports.dimension_0 () << " < offset + numBytes = "
2750  << (offset + numBytes) << ".");
2751 
2752  const GO gid = 0; // packValueCount wants this
2753  const LO lid = 0; // packValueCount wants this
2754 
2755  const size_t numEntBeg = offset;
2756  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
2757  const size_t gidsBeg = numEntBeg + numEntLen;
2758  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2759  const size_t valsBeg = gidsBeg + gidsLen;
2760  const size_t valsLen = numScalarEnt * numBytesPerValue;
2761 
2762  input_buffer_type numEntIn =
2763  subview (imports, pair_type (numEntBeg, numEntBeg + numEntLen));
2764  input_buffer_type gidsIn =
2765  subview (imports, pair_type (gidsBeg, gidsBeg + gidsLen));
2766  input_buffer_type valsIn =
2767  subview (imports, pair_type (valsBeg, valsBeg + valsLen));
2768 
2769  size_t numBytesOut = 0;
2770  LO numEntOut;
2771  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
2772  TEUCHOS_TEST_FOR_EXCEPTION(
2773  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2774  "unpackRow: Expected number of entries " << numEnt
2775  << " != actual number of entries " << numEntOut << ".");
2776 
2777  numBytesOut += PackTraits<GO, D>::unpackArray (gidsOut, gidsIn, numEnt);
2778  numBytesOut += PackTraits<ST, D>::unpackArray (valsOut, valsIn, numScalarEnt);
2779 
2780  TEUCHOS_TEST_FOR_EXCEPTION(
2781  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
2782  << numBytesOut << " != numBytes = " << numBytes << ".");
2783  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2784  TEUCHOS_TEST_FOR_EXCEPTION(
2785  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
2786  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2787  << expectedNumBytes << ".");
2788  return numBytesOut;
2789  }
2790  } // namespace (anonymous)
2791 
2792  template<class Scalar, class LO, class GO, class Node>
2793  void
2794  BlockCrsMatrix<Scalar, LO, GO, Node>::
2795  packAndPrepare (const Tpetra::SrcDistObject& source,
2796  const Teuchos::ArrayView<const LO>& exportLIDs,
2797  Teuchos::Array<packet_type>& exports,
2798  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
2799  size_t& constantNumPackets,
2800  Tpetra::Distributor& /* distor */)
2801  {
2802  using std::endl;
2804  using Kokkos::MemoryUnmanaged;
2805  using Kokkos::subview;
2806  using Kokkos::View;
2808  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
2809  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2810  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
2811  const bool debug = false;
2812 
2813  if (debug) {
2814  std::ostringstream os;
2815  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2816  os << "Proc " << myRank << ": packAndPrepare: exportLIDs.size() = "
2817  << exportLIDs.size () << ", numPacketsPerLID.size() = "
2818  << numPacketsPerLID.size () << endl;
2819  std::cerr << os.str ();
2820  }
2821 
2822  if (* (this->localError_)) {
2823  std::ostream& err = this->markLocalErrorAndGetStream ();
2824  err << "packAndPrepare: The target object of the Import or Export is "
2825  "already in an error state." << endl;
2826  return;
2827  }
2828 
2829  const this_type* src = dynamic_cast<const this_type* > (&source);
2830  // Should have checked for these cases in checkSizes().
2831  if (src == NULL) {
2832  std::ostream& err = this->markLocalErrorAndGetStream ();
2833  err << "packAndPrepare: The source (input) object of the Import or "
2834  "Export is either not a BlockCrsMatrix, or does not have the right "
2835  "template parameters. checkSizes() should have caught this. "
2836  "Please report this bug to the Tpetra developers." << endl;
2837  return;
2838  }
2839 
2840  const crs_graph_type& srcGraph = src->graph_;
2841  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2842  const size_type numExportLIDs = exportLIDs.size ();
2843 
2844  if (numExportLIDs != numPacketsPerLID.size ()) {
2845  std::ostream& err = this->markLocalErrorAndGetStream ();
2846  err << "packAndPrepare: exportLIDs.size() = " << numExportLIDs
2847  << " != numPacketsPerLID.size() = " << numPacketsPerLID.size ()
2848  << "." << endl;
2849  return;
2850  }
2851 
2852  // Graphs and matrices are allowed to have a variable number of
2853  // entries per row. We could test whether all rows have the same
2854  // number of entries, but DistObject can only use this
2855  // optimization if all rows on _all_ processes have the same
2856  // number of entries. Rather than do the all-reduce necessary to
2857  // test for this unlikely case, we tell DistObject (by setting
2858  // constantNumPackets to zero) to assume that different rows may
2859  // have different numbers of entries.
2860  constantNumPackets = 0;
2861 
2862  // Compute the number of bytes ("packets") per row to pack. While
2863  // we're at it, compute the total # of block entries to send, and
2864  // the max # of block entries in any of the rows we're sending.
2865  size_t totalNumBytes = 0;
2866  size_t totalNumEntries = 0;
2867  size_t maxRowLength = 0;
2868  for (size_type i = 0; i < numExportLIDs; ++i) {
2869  const LO lclRow = exportLIDs[i];
2870  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2871  // If any given LIDs are invalid, the above might return either
2872  // zero or the invalid size_t value. If the former, we have no
2873  // way to tell, but that's OK; it just means the calling process
2874  // won't pack anything (it has nothing to pack anyway). If the
2875  // latter, we replace it with zero (that row is not owned by the
2876  // calling process, so it has no entries to pack).
2877  if (numEnt == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2878  numEnt = 0;
2879  }
2880  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2881 
2882  // The 'if' branch implicitly assumes that packRowCount() returns
2883  // zero if numEnt == 0.
2884  size_t numBytesPerValue = 0;
2885  if (numEnt > 0) {
2886  // Get a locally indexed view of the current row's data. If
2887  // the current row has > 0 entries, we need an entry in order
2888  // to figure out the byte count of the packed row. (We really
2889  // only need it if ST's size is determined at run time.)
2890  Scalar* valsRaw = NULL;
2891  const LO* lidsRaw = NULL;
2892  LO actualNumEnt = 0;
2893  const LO errCode =
2894  src->getLocalRowView (lclRow, lidsRaw, valsRaw, actualNumEnt);
2895 
2896  if (numEnt != static_cast<size_t> (actualNumEnt)) {
2897  std::ostream& err = this->markLocalErrorAndGetStream ();
2898  err << "packAndPrepare: Local row " << lclRow << " claims to have " <<
2899  numEnt << " entry/ies, but getLocalRowView() reports "
2900  << actualNumEnt << " entry/ies. This should never happen. "
2901  "Please report this bug to the Tpetra developers." << endl;
2902  return;
2903  }
2904  if (errCode == Teuchos::OrdinalTraits<LO>::invalid ()) {
2905  std::ostream& err = this->markLocalErrorAndGetStream ();
2906  err << "packAndPrepare: Local row " << lclRow << " is not in the row Map "
2907  "of the source object on the calling process." << endl;
2908  return;
2909  }
2910 
2911  const ST* valsRawST =
2912  const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2913  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2914 
2915  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2916  // space here for now, this doesn't assume UVM. That may change
2917  // in the future, if we ever start packing on the device.
2918  numBytesPerValue = PackTraits<ST, HES>::packValueCount (vals(0));
2919  }
2920 
2921  const size_t numBytes =
2922  packRowCount<LO, GO, HES> (numEnt, numBytesPerValue, blockSize);
2923  numPacketsPerLID[i] = numBytes;
2924  totalNumBytes += numBytes;
2925  totalNumEntries += numEnt;
2926  maxRowLength = std::max (maxRowLength, numEnt);
2927  }
2928 
2929  if (debug) {
2930  const int myRank = graph_.getComm ()->getRank ();
2931  std::ostringstream os;
2932  os << "Proc " << myRank << ": packAndPrepare: totalNumBytes = "
2933  << totalNumBytes << endl;
2934  std::cerr << os.str ();
2935  }
2936 
2937  // We use a "struct of arrays" approach to packing each row's
2938  // entries: the number of entries goes first, then all the column
2939  // indices (as global indices), and then the values.
2940  exports.resize (totalNumBytes);
2941  if (totalNumEntries > 0) {
2942  View<char*, HES, MemoryUnmanaged> exportsK (exports.getRawPtr (),
2943  totalNumBytes);
2944 
2945  // Current position (in bytes) in the 'exports' output array.
2946  size_t offset = 0;
2947 
2948  // For each block row of the matrix owned by the calling
2949  // process, pack that block row's column indices and values into
2950  // the exports array.
2951 
2952  // Source matrix's column Map. We verified in checkSizes() that
2953  // the column Map exists (is not null).
2954  const map_type& srcColMap = * (srcGraph.getColMap ());
2955 
2956  // Temporary buffer for global column indices.
2957  View<GO*, HES> gblColInds;
2958  {
2959  GO gid = 0;
2960  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowLength, "gids");
2961  }
2962 
2963  // Pack the data for each row to send, into the 'exports' buffer.
2964  for (size_type i = 0; i < numExportLIDs; ++i) {
2965  const LO lclRowInd = exportLIDs[i];
2966  const LO* lclColIndsRaw;
2967  Scalar* valsRaw;
2968  LO numEntLO;
2969  // It's OK to ignore the return value, since if the calling
2970  // process doesn't own that local row, then the number of
2971  // entries in that row on the calling process is zero.
2972  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
2973  const size_t numEnt = static_cast<size_t> (numEntLO);
2974  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2975  View<const LO*, HES, MemoryUnmanaged> lclColInds (lclColIndsRaw, numEnt);
2976  const ST* valsRawST = const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2977  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2978 
2979  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2980  // space here for now, this doesn't assume UVM. That may
2981  // change in the future, if we ever start packing on device.
2982  const size_t numBytesPerValue = numEnt == 0 ?
2983  static_cast<size_t> (0) :
2984  PackTraits<ST, HES>::packValueCount (vals(0));
2985 
2986  // Convert column indices from local to global.
2987  for (size_t j = 0; j < numEnt; ++j) {
2988  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2989  }
2990 
2991  // Copy the row's data into the current spot in the exports array.
2992  const size_t numBytes =
2993  packRowForBlockCrs<ST, LO, GO, HES> (exportsK, offset, numEnt, gblColInds,
2994  vals, numBytesPerValue, blockSize);
2995  // Keep track of how many bytes we packed.
2996  offset += numBytes;
2997  } // for each LID (of a row) to send
2998 
2999  if (offset != totalNumBytes) {
3000  std::ostream& err = this->markLocalErrorAndGetStream ();
3001  err << "packAndPrepare: At end of method, the final offset (in bytes) "
3002  << offset << " does not equal the total number of bytes packed "
3003  << totalNumBytes << ". "
3004  << "Please report this bug to the Tpetra developers." << endl;
3005  return;
3006  }
3007  } // if totalNumEntries > 0
3008 
3009  if (debug) {
3010  std::ostringstream os;
3011  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3012  const bool lclSuccess = ! (* (this->localError_));
3013  os << "*** Proc " << myRank << ": packAndPrepare "
3014  << (lclSuccess ? "succeeded" : "FAILED")
3015  << " (totalNumEntries = " << totalNumEntries << ") ***" << endl;
3016  std::cerr << os.str ();
3017  }
3018  }
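  // Illustrative sketch (not the authoritative wire format): the per-row
  // packet layout assumed by packRowForBlockCrs / unpackRowForBlockCrs is
  // the row's entry count, then its global column indices, then its block
  // values.  Under that assumption, the byte count that packRowCount
  // returns for one row is roughly
  //
  //   const size_t numBytes =
  //     PackTraits<LO, HES>::packValueCount (LO (0))             // entry count
  //     + numEnt * PackTraits<GO, HES>::packValueCount (GO (0))  // column indices
  //     + numEnt * blockSize * blockSize * numBytesPerValue;     // block values
  //
  // with numEnt, blockSize, and numBytesPerValue as in packAndPrepare above.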
3019 
3020 
3021  template<class Scalar, class LO, class GO, class Node>
3022  void
3023  BlockCrsMatrix<Scalar, LO, GO, Node>::
3024  unpackAndCombine (const Teuchos::ArrayView<const LO>& importLIDs,
3025  const Teuchos::ArrayView<const packet_type>& imports,
3026  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
3027  size_t /* constantNumPackets */, // not worthwhile to use this
3028  Tpetra::Distributor& /* distor */,
3029  Tpetra::CombineMode CM)
3030  {
3031  using std::endl;
3033  using Kokkos::MemoryUnmanaged;
3034  using Kokkos::subview;
3035  using Kokkos::View;
3036  typedef impl_scalar_type ST;
3037  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
3038  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
3039  typedef std::pair<typename View<int*, HES>::size_type,
3040  typename View<int*, HES>::size_type> pair_type;
3041  typedef View<GO*, HES, MemoryUnmanaged> gids_out_type;
3042  typedef View<LO*, HES, MemoryUnmanaged> lids_out_type;
3043  typedef View<ST*, HES, MemoryUnmanaged> vals_out_type;
3044  typedef typename PackTraits<GO, HES>::input_buffer_type input_buffer_type;
3045  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::unpackAndCombine: ";
3046  const bool debug = false;
3047 
3048  if (debug) {
3049  std::ostringstream os;
3050  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3051  os << "Proc " << myRank << ": unpackAndCombine" << endl;
3052  std::cerr << os.str ();
3053  }
3054 
3055  // It should not cause deadlock to return on error in this method,
3056  // since this method does not communicate.
3057 
3058  if (* (this->localError_)) {
3059  std::ostream& err = this->markLocalErrorAndGetStream ();
3060  err << prefix << "The target object of the Import or Export is "
3061  "already in an error state." << endl;
3062  return;
3063  }
3064  if (importLIDs.size () != numPacketsPerLID.size ()) {
3065  std::ostream& err = this->markLocalErrorAndGetStream ();
3066  err << prefix << "importLIDs.size() = " << importLIDs.size () << " != "
3067  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << "." << endl;
3068  return;
3069  }
3070  if (CM != ADD && CM != INSERT && CM != REPLACE && CM != ABSMAX && CM != ZERO) {
3071  std::ostream& err = this->markLocalErrorAndGetStream ();
3072  err << prefix << "Invalid CombineMode value " << CM << ". Valid "
3073  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
3074  << endl;
3075  return;
3076  }
3077 
3078  // Target matrix's column Map. Use to convert the global column
3079  // indices in the receive buffer to local indices. We verified in
3080  // checkSizes() that the column Map exists (is not null).
3081  const map_type& tgtColMap = * (this->graph_.getColMap ());
3082 
3083  const size_type numImportLIDs = importLIDs.size ();
3084  if (CM == ZERO || numImportLIDs == 0) {
3085  if (debug) {
3086  std::ostringstream os;
3087  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3088  os << "Proc " << myRank << ": unpackAndCombine: Nothing to do" << endl;
3089  std::cerr << os.str ();
3090  }
3091  return; // nothing to do; no need to combine entries
3092  }
3093 
3094  if (debug) {
3095  std::ostringstream os;
3096  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3097  os << "Proc " << myRank << ": unpackAndCombine: Getting ready" << endl;
3098  std::cerr << os.str ();
3099  }
3100 
3101  input_buffer_type importsK (imports.getRawPtr (), imports.size ());
3102  const size_t blockSize = this->getBlockSize ();
3103  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
3104  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
3105 
3106  size_t numBytesPerValue;
3107  {
3108  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
3109  // run-time size? We already assume that all entries in both
3110  // the source and target matrices have the same size. If the
3111  // calling process owns at least one entry in either matrix, we
3112  // can use that entry to set the size. However, it is possible
3113  // that the calling process owns no entries. In that case,
3114  // we're in trouble. One way to fix this would be for each
3115  // row's data to contain the run-time size. This is only
3116  // necessary if the size is not a compile-time constant.
3117  Scalar val;
3118  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
3119  }
3120 
3121  // Temporary space to cache incoming global column indices and
3122  // values. Column indices come in as global indices, in case the
3123  // source object's column Map differs from the target object's
3124  // (this's) column Map.
3125  View<GO*, HES> gblColInds;
3126  View<LO*, HES> lclColInds;
3127  View<ST*, HES> vals;
3128  {
3129  GO gid = 0;
3130  LO lid = 0;
3131  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
3132  // run-time size? We already assume that all entries in both
3133  // the source and target matrices have the same size. If the
3134  // calling process owns at least one entry in either matrix, we
3135  // can use that entry to set the size. However, it is possible
3136  // that the calling process owns no entries. In that case,
3137  // we're in trouble. One way to fix this would be for each
3138  // row's data to contain the run-time size. This is only
3139  // necessary if the size is not a compile-time constant.
3140  Scalar val;
3141  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowNumEnt, "gids");
3142  lclColInds = PackTraits<LO, HES>::allocateArray (lid, maxRowNumEnt, "lids");
3143  vals = PackTraits<ST, HES>::allocateArray (val, maxRowNumScalarEnt, "vals");
3144  }
3145 
3146  size_t offset = 0;
3147  bool errorDuringUnpack = false;
3148  for (size_type i = 0; i < numImportLIDs; ++i) {
3149  const size_t numBytes = numPacketsPerLID[i];
3150  if (numBytes == 0) {
3151  continue; // empty buffer for that row means that the row is empty
3152  }
3153  const size_t numEnt =
3154  unpackRowCount<ST, LO, GO, HES> (importsK, offset, numBytes,
3155  numBytesPerValue);
3156  if (numEnt > maxRowNumEnt) {
3157  errorDuringUnpack = true;
3158 #ifdef HAVE_TPETRA_DEBUG
3159  std::ostream& err = this->markLocalErrorAndGetStream ();
3160  err << prefix << "At i = " << i << ", numEnt = " << numEnt
3161  << " > maxRowNumEnt = " << maxRowNumEnt << endl;
3162 #endif // HAVE_TPETRA_DEBUG
3163  continue;
3164  }
3165 
3166  const size_t numScalarEnt = numEnt * blockSize * blockSize;
3167  const LO lclRow = importLIDs[i];
3168 
3169  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
3170  vals_out_type valsOut = subview (vals, pair_type (0, numScalarEnt));
3171 
3172  const size_t numBytesOut =
3173  unpackRowForBlockCrs<ST, LO, GO, HES> (gidsOut, valsOut, importsK,
3174  offset, numBytes, numEnt,
3175  numBytesPerValue, blockSize);
3176  if (numBytes != numBytesOut) {
3177  errorDuringUnpack = true;
3178 #ifdef HAVE_TPETRA_DEBUG
3179  std::ostream& err = this->markLocalErrorAndGetStream ();
3180  err << prefix << "At i = " << i << ", numBytes = " << numBytes
3181  << " != numBytesOut = " << numBytesOut << ".";
3182 #endif // HAVE_TPETRA_DEBUG
3183  continue;
3184  }
3185 
3186  // Convert incoming global indices to local indices.
3187  lids_out_type lidsOut = subview (lclColInds, pair_type (0, numEnt));
3188  for (size_t k = 0; k < numEnt; ++k) {
3189  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
3190  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
3191  errorDuringUnpack = true;
3192 #ifdef HAVE_TPETRA_DEBUG
3193  std::ostream& err = this->markLocalErrorAndGetStream ();
3194  err << prefix << "At i = " << i << ", GID " << gidsOut(k)
3195  << " is not owned by the calling process.";
3196 #endif // HAVE_TPETRA_DEBUG
3197  continue;
3198  }
3199  }
3200 
3201  // Combine the incoming data with the matrix's current data.
3202  LO numCombd = 0;
3203  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.ptr_on_device ());
3204  const Scalar* const valsRaw =
3205  reinterpret_cast<const Scalar*> (const_cast<const ST*> (valsOut.ptr_on_device ()));
3206  if (CM == ADD) {
3207  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3208  } else if (CM == INSERT || CM == REPLACE) {
3209  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3210  } else if (CM == ABSMAX) {
3211  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3212  }
3213 
3214  if (static_cast<LO> (numEnt) != numCombd) {
3215  errorDuringUnpack = true;
3216 #ifdef HAVE_TPETRA_DEBUG
3217  std::ostream& err = this->markLocalErrorAndGetStream ();
3218  err << prefix << "At i = " << i << ", numEnt = " << numEnt
3219  << " != numCombd = " << numCombd << ".";
3220 #endif // HAVE_TPETRA_DEBUG
3221  continue;
3222  }
3223 
3224  // Don't update offset until current LID has succeeded.
3225  offset += numBytes;
3226  } // for each import LID i
3227 
3228  if (errorDuringUnpack) {
3229  std::ostream& err = this->markLocalErrorAndGetStream ();
3230  err << prefix << "Unpacking failed.";
3231 #ifndef HAVE_TPETRA_DEBUG
3232  err << " Please run again with a debug build to get more verbose "
3233  "diagnostic output.";
3234 #endif // ! HAVE_TPETRA_DEBUG
3235  err << endl;
3236  }
3237 
3238  if (debug) {
3239  std::ostringstream os;
3240  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3241  const bool lclSuccess = ! (* (this->localError_));
3242  os << "*** Proc " << myRank << ": unpackAndCombine "
3243  << (lclSuccess ? "succeeded" : "FAILED")
3244  << " ***" << endl;
3245  std::cerr << os.str ();
3246  }
3247  }
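  // Illustrative usage sketch: unpackAndCombine is not normally called
  // directly; it runs as the receive-side stage of doImport() / doExport()
  // on the target matrix, and the CombineMode passed there selects the
  // local update used above (ADD -> sumIntoLocalValues, INSERT / REPLACE ->
  // replaceLocalValues, ABSMAX -> absMaxLocalValues, ZERO -> no-op).
  // The object names below are placeholders, not part of this file:
  //
  //   Tpetra::Import<LO, GO, Node> importer (srcMatrix.getRowMap (),
  //                                          tgtMatrix.getRowMap ());
  //   tgtMatrix.doImport (srcMatrix, importer, Tpetra::ADD);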
3248 
3249 
3250  template<class Scalar, class LO, class GO, class Node>
3251  std::string
3253  {
3254  using Teuchos::TypeNameTraits;
3255  std::ostringstream os;
3256  os << "\"Tpetra::BlockCrsMatrix\": { "
3257  << "Template parameters: { "
3258  << "Scalar: " << TypeNameTraits<Scalar>::name ()
3259  << ", LO: " << TypeNameTraits<LO>::name ()
3260  << ", GO: " << TypeNameTraits<GO>::name ()
3261  << ", Node: " << TypeNameTraits<Node>::name ()
3262  << " }"
3263  << ", Label: \"" << this->getObjectLabel () << "\""
3264  << ", Global dimensions: ["
3265  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3266  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3267  << ", Block size: " << getBlockSize ()
3268  << " }";
3269  return os.str ();
3270  }
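  // For reference, the one-line string built above has this general shape
  // (the values shown are illustrative only):
  //
  //   "Tpetra::BlockCrsMatrix": { Template parameters: { Scalar: double,
  //   LO: int, GO: long long, Node: ... }, Label: "", Global dimensions:
  //   [300, 300], Block size: 3 }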
3271 
3272 
3273  template<class Scalar, class LO, class GO, class Node>
3274  void
3276  describe (Teuchos::FancyOStream& out,
3277  const Teuchos::EVerbosityLevel verbLevel) const
3278  {
3279  using Teuchos::ArrayRCP;
3280  using Teuchos::CommRequest;
3281  using Teuchos::FancyOStream;
3282  using Teuchos::getFancyOStream;
3283  using Teuchos::ireceive;
3284  using Teuchos::isend;
3285  using Teuchos::outArg;
3286  using Teuchos::TypeNameTraits;
3287  using Teuchos::VERB_DEFAULT;
3288  using Teuchos::VERB_NONE;
3289  using Teuchos::VERB_LOW;
3290  using Teuchos::VERB_MEDIUM;
3291  // using Teuchos::VERB_HIGH;
3292  using Teuchos::VERB_EXTREME;
3293  using Teuchos::RCP;
3294  using Teuchos::wait;
3295  using std::endl;
3296 #ifdef HAVE_TPETRA_DEBUG
3297  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::describe: ";
3298 #endif // HAVE_TPETRA_DEBUG
3299 
3300  // Set default verbosity if applicable.
3301  const Teuchos::EVerbosityLevel vl =
3302  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3303 
3304  if (vl == VERB_NONE) {
3305  return; // print nothing
3306  }
3307 
3308  // describe() always starts with a tab before it prints anything.
3309  Teuchos::OSTab tab0 (out);
3310 
3311  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3312  Teuchos::OSTab tab1 (out);
3313 
3314  out << "Template parameters:" << endl;
3315  {
3316  Teuchos::OSTab tab2 (out);
3317  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3318  << "LO: " << TypeNameTraits<LO>::name () << endl
3319  << "GO: " << TypeNameTraits<GO>::name () << endl
3320  << "Node: " << TypeNameTraits<Node>::name () << endl;
3321  }
3322  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3323  << "Global dimensions: ["
3324  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3325  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3326 
3327  const LO blockSize = getBlockSize ();
3328  out << "Block size: " << blockSize << endl;
3329 
3330  // constituent objects
3331  if (vl >= VERB_MEDIUM) {
3332  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3333  const int myRank = comm.getRank ();
3334  if (myRank == 0) {
3335  out << "Row Map:" << endl;
3336  }
3337  getRowMap()->describe (out, vl);
3338 
3339  if (! getColMap ().is_null ()) {
3340  if (getColMap() == getRowMap()) {
3341  if (myRank == 0) {
3342  out << "Column Map: same as row Map" << endl;
3343  }
3344  }
3345  else {
3346  if (myRank == 0) {
3347  out << "Column Map:" << endl;
3348  }
3349  getColMap ()->describe (out, vl);
3350  }
3351  }
3352  if (! getDomainMap ().is_null ()) {
3353  if (getDomainMap () == getRowMap ()) {
3354  if (myRank == 0) {
3355  out << "Domain Map: same as row Map" << endl;
3356  }
3357  }
3358  else if (getDomainMap () == getColMap ()) {
3359  if (myRank == 0) {
3360  out << "Domain Map: same as column Map" << endl;
3361  }
3362  }
3363  else {
3364  if (myRank == 0) {
3365  out << "Domain Map:" << endl;
3366  }
3367  getDomainMap ()->describe (out, vl);
3368  }
3369  }
3370  if (! getRangeMap ().is_null ()) {
3371  if (getRangeMap () == getDomainMap ()) {
3372  if (myRank == 0) {
3373  out << "Range Map: same as domain Map" << endl;
3374  }
3375  }
3376  else if (getRangeMap () == getRowMap ()) {
3377  if (myRank == 0) {
3378  out << "Range Map: same as row Map" << endl;
3379  }
3380  }
3381  else {
3382  if (myRank == 0) {
3383  out << "Range Map:" << endl;
3384  }
3385  getRangeMap ()->describe (out, vl);
3386  }
3387  }
3388  }
3389 
3390  if (vl >= VERB_EXTREME) {
3391  // FIXME (mfh 26 May 2016) It's not nice for this method to sync
3392  // to host, since it's supposed to be const. However, that's
3393  // the easiest and least memory-intensive way to implement this
3394  // method.
3395  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
3396  const_cast<this_type*> (this)->template sync<Kokkos::HostSpace> ();
3397 
3398 #ifdef HAVE_TPETRA_DEBUG
3399  TEUCHOS_TEST_FOR_EXCEPTION
3400  (this->template need_sync<Kokkos::HostSpace> (), std::logic_error,
3401  prefix << "Right after sync to host, the matrix claims that it needs "
3402  "sync to host. Please report this bug to the Tpetra developers.");
3403 #endif // HAVE_TPETRA_DEBUG
3404 
3405  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3406  const int myRank = comm.getRank ();
3407  const int numProcs = comm.getSize ();
3408 
3409  // Print the calling process' data to the given output stream.
3410  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3411  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3412  FancyOStream& os = *osPtr;
3413  os << "Process " << myRank << ":" << endl;
3414  Teuchos::OSTab tab2 (os);
3415 
3416  const map_type& meshRowMap = * (graph_.getRowMap ());
3417  const map_type& meshColMap = * (graph_.getColMap ());
3418  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3419  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3420  ++meshLclRow) {
3421  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3422  os << "Row " << meshGblRow << ": {";
3423 
3424  const LO* lclColInds = NULL;
3425  Scalar* vals = NULL;
3426  LO numInds = 0;
3427  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
3428 
3429  for (LO k = 0; k < numInds; ++k) {
3430  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3431 
3432  os << "Col " << gblCol << ": [";
3433  for (LO i = 0; i < blockSize; ++i) {
3434  for (LO j = 0; j < blockSize; ++j) {
3435  os << vals[blockSize*blockSize*k + i*blockSize + j];
3436  if (j + 1 < blockSize) {
3437  os << ", ";
3438  }
3439  }
3440  if (i + 1 < blockSize) {
3441  os << "; ";
3442  }
3443  }
3444  os << "]";
3445  if (k + 1 < numInds) {
3446  os << ", ";
3447  }
3448  }
3449  os << "}" << endl;
3450  }
3451 
3452  // Print data on Process 0. This will automatically respect the
3453  // current indentation level.
3454  if (myRank == 0) {
3455  out << lclOutStrPtr->str ();
3456  lclOutStrPtr = Teuchos::null; // clear it to save space
3457  }
3458 
3459  const int sizeTag = 1337;
3460  const int dataTag = 1338;
3461 
3462  ArrayRCP<char> recvDataBuf; // only used on Process 0
3463 
3464  // Send string sizes and data from each process in turn to
3465  // Process 0, and print on that process.
3466  for (int p = 1; p < numProcs; ++p) {
3467  if (myRank == 0) {
3468  // Receive the incoming string's length.
3469  ArrayRCP<size_t> recvSize (1);
3470  recvSize[0] = 0;
3471  RCP<CommRequest<int> > recvSizeReq =
3472  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3473  wait<int> (comm, outArg (recvSizeReq));
3474  const size_t numCharsToRecv = recvSize[0];
3475 
3476  // Allocate space for the string to receive. Reuse receive
3477  // buffer space if possible. We can do this because in the
3478  // current implementation, we only have one receive in
3479  // flight at a time. Leave space for the '\0' at the end,
3480  // in case the sender doesn't send it.
3481  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3482  recvDataBuf.resize (numCharsToRecv + 1);
3483  }
3484  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3485  // Post the receive of the actual string data.
3486  RCP<CommRequest<int> > recvDataReq =
3487  ireceive<int, char> (recvData, p, dataTag, comm);
3488  wait<int> (comm, outArg (recvDataReq));
3489 
3490  // Print the received data. This will respect the current
3491  // indentation level. Make sure that the string is
3492  // null-terminated.
3493  recvDataBuf[numCharsToRecv] = '\0';
3494  out << recvDataBuf.getRawPtr ();
3495  }
3496  else if (myRank == p) { // if I am not Process 0, and my rank is p
3497  // This deep-copies the string at most twice, depending on
3498  // whether the std::string implementation reference counts
3499  // internally (C++11-conforming implementations do not, so expect a copy).
3500  const std::string stringToSend = lclOutStrPtr->str ();
3501  lclOutStrPtr = Teuchos::null; // clear original to save space
3502 
3503  // Send the string's length to Process 0.
3504  const size_t numCharsToSend = stringToSend.size ();
3505  ArrayRCP<size_t> sendSize (1);
3506  sendSize[0] = numCharsToSend;
3507  RCP<CommRequest<int> > sendSizeReq =
3508  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3509  wait<int> (comm, outArg (sendSizeReq));
3510 
3511  // Send the actual string to Process 0. We know that the
3512  // string has length > 0, so it's safe to take the address
3513  // of the first entry. Make a nonowning ArrayRCP to hold
3514  // the string. Process 0 will add a null termination
3515  // character at the end of the string, after it receives the
3516  // message.
3517  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3518  RCP<CommRequest<int> > sendDataReq =
3519  isend<int, char> (sendData, 0, dataTag, comm);
3520  wait<int> (comm, outArg (sendDataReq));
3521  }
3522  } // for each process rank p other than 0
3523  } // extreme verbosity level (print the whole matrix)
3524  }
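  // Illustrative usage sketch: describe() takes a Teuchos::FancyOStream.
  // A typical call that wraps std::cout and prints every block of every
  // row, as the VERB_EXTREME branch above does (the matrix name "A" is a
  // placeholder):
  //
  //   Teuchos::RCP<Teuchos::FancyOStream> out =
  //     Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));
  //   A.describe (*out, Teuchos::VERB_EXTREME);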
3525 
3526  template<class Scalar, class LO, class GO, class Node>
3527  Teuchos::RCP<const Teuchos::Comm<int> >
3529  getComm() const
3530  {
3531  return graph_.getComm();
3532  }
3533 
3534  template<class Scalar, class LO, class GO, class Node>
3535  Teuchos::RCP<Node>
3537  getNode() const
3538  {
3539  return graph_.getNode();
3540 
3541  }
3542 
3543  template<class Scalar, class LO, class GO, class Node>
3547  {
3548  return graph_.getGlobalNumCols();
3549  }
3550 
3551  template<class Scalar, class LO, class GO, class Node>
3552  size_t
3555  {
3556  return graph_.getNodeNumCols();
3557  }
3558 
3559  template<class Scalar, class LO, class GO, class Node>
3560  GO
3563  {
3564  return graph_.getIndexBase();
3565  }
3566 
3567  template<class Scalar, class LO, class GO, class Node>
3571  {
3572  return graph_.getGlobalNumEntries();
3573  }
3574 
3575  template<class Scalar, class LO, class GO, class Node>
3576  size_t
3579  {
3580  return graph_.getNodeNumEntries();
3581  }
3582 
3583  template<class Scalar, class LO, class GO, class Node>
3584  size_t
3586  getNumEntriesInGlobalRow (GO globalRow) const
3587  {
3588  return graph_.getNumEntriesInGlobalRow(globalRow);
3589  }
3590 
3591  template<class Scalar, class LO, class GO, class Node>
3595  {
3596  return graph_.getGlobalNumDiags();
3597  }
3598 
3599  template<class Scalar, class LO, class GO, class Node>
3600  size_t
3603  {
3604  return graph_.getNodeNumDiags();
3605  }
3606 
3607  template<class Scalar, class LO, class GO, class Node>
3608  size_t
3611  {
3612  return graph_.getGlobalMaxNumRowEntries();
3613  }
3614 
3615  template<class Scalar, class LO, class GO, class Node>
3616  bool
3618  hasColMap() const
3619  {
3620  return graph_.hasColMap();
3621  }
3622 
3623  template<class Scalar, class LO, class GO, class Node>
3624  bool
3627  {
3628  return graph_.isLowerTriangular();
3629  }
3630 
3631  template<class Scalar, class LO, class GO, class Node>
3632  bool
3635  {
3636  return graph_.isUpperTriangular();
3637  }
3638 
3639  template<class Scalar, class LO, class GO, class Node>
3640  bool
3643  {
3644  return graph_.isLocallyIndexed();
3645  }
3646 
3647  template<class Scalar, class LO, class GO, class Node>
3648  bool
3651  {
3652  return graph_.isGloballyIndexed();
3653  }
3654 
3655  template<class Scalar, class LO, class GO, class Node>
3656  bool
3659  {
3660  return graph_.isFillComplete ();
3661  }
3662 
3663  template<class Scalar, class LO, class GO, class Node>
3664  bool
3667  {
3668  return false;
3669  }
3670 
3671 
3672  template<class Scalar, class LO, class GO, class Node>
3673  void
3675  getGlobalRowCopy (GO GlobalRow,
3676  const Teuchos::ArrayView<GO> &Indices,
3677  const Teuchos::ArrayView<Scalar> &Values,
3678  size_t &NumEntries) const
3679  {
3680  TEUCHOS_TEST_FOR_EXCEPTION(
3681  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowCopy: "
3682  "This class doesn't support global matrix indexing.");
3683 
3684  }
3685 
3686  template<class Scalar, class LO, class GO, class Node>
3687  void
3689  getGlobalRowView (GO GlobalRow,
3690  Teuchos::ArrayView<const GO> &indices,
3691  Teuchos::ArrayView<const Scalar> &values) const
3692  {
3693  TEUCHOS_TEST_FOR_EXCEPTION(
3694  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
3695  "This class doesn't support global matrix indexing.");
3696 
3697  }
3698 
3699  template<class Scalar, class LO, class GO, class Node>
3700  void
3702  getLocalRowView (LO LocalRow,
3703  Teuchos::ArrayView<const LO> &indices,
3704  Teuchos::ArrayView<const Scalar> &values) const
3705  {
3706  TEUCHOS_TEST_FOR_EXCEPTION(
3707  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: "
3708  "This class doesn't support local matrix indexing.");
3709 
3710  }
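  // The Teuchos::ArrayView overload above is deliberately disabled.  Row
  // data are available through the pointer-based overload
  // getLocalRowView (LO, const LO*&, Scalar*&, LO&) used elsewhere in this
  // file.  Illustrative sketch (the matrix name "A" is a placeholder):
  //
  //   const LO* lclColInds = NULL;
  //   Scalar* vals = NULL; // numEnt blocks, each blockSize x blockSize, row major
  //   LO numEnt = 0;
  //   const LO err = A.getLocalRowView (lclRow, lclColInds, vals, numEnt);
  //   if (err != Teuchos::OrdinalTraits<LO>::invalid ()) {
  //     // vals[k*blockSize*blockSize + i*blockSize + j] is entry (i,j) of block k.
  //   }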
3711 
3712 
3713  template<class Scalar, class LO, class GO, class Node>
3714  void
3717  {
3718 #ifdef HAVE_TPETRA_DEBUG
3719  const char prefix[] =
3720  "Tpetra::Experimental::BlockCrsMatrix::getLocalDiagCopy: ";
3721 #endif // HAVE_TPETRA_DEBUG
3722 
3723  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3724 
3725  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3726  graph_.getLocalDiagOffsets (diagOffsets);
3727 
3728  // The code below works on host, so use a host View.
3729  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3730  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3731  // We're filling diag on host for now.
3732  diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
3733 
3734 #ifdef HAVE_TPETRA_DEBUG
3735  TEUCHOS_TEST_FOR_EXCEPTION
3736  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
3737  prefix << "The matrix's data were last modified on device, but have "
3738  "not been sync'd to host. Please sync to host (by calling "
3739  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
3740  "method.");
3741 #endif // HAVE_TPETRA_DEBUG
3742 
3743  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
3744  // version of the data always exists (no lazy allocation for host
3745  // data).
3746  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
3747  auto vals_host_out =
3748  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
3749  Scalar* vals_host_out_raw =
3750  reinterpret_cast<Scalar*> (vals_host_out.ptr_on_device ());
3751 
3752  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3753  size_t rowOffset = 0;
3754  size_t offset = 0;
3755  LO bs = getBlockSize();
3756  for(size_t r=0; r<getNodeNumRows(); r++)
3757  {
3758  // move pointer to start of diagonal block
3759  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3760  for(int b=0; b<bs; b++)
3761  {
3762  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3763  }
3764  // move pointer to start of next block row
3765  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3766  }
3767 
3768  diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3769  }
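  // Illustrative note: the loop above relies on each block being stored
  // row major (LayoutRight), so within a blockSize x blockSize block the
  // (j, j) diagonal entry lives at offset j*(blockSize + 1) from the
  // block's first scalar.  For example, with blockSize = 3 the diagonal
  // entries of one block sit at offsets 0, 4, and 8:
  //
  //   // scalar offsets within one 3x3 block, row major:
  //   //   [ 0 1 2 ]
  //   //   [ 3 4 5 ]
  //   //   [ 6 7 8 ]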
3770 
3771  template<class Scalar, class LO, class GO, class Node>
3772  void
3775  {
3776  TEUCHOS_TEST_FOR_EXCEPTION(
3777  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::leftScale: "
3778  "not implemented.");
3779 
3780  }
3781 
3782  template<class Scalar, class LO, class GO, class Node>
3783  void
3786  {
3787  TEUCHOS_TEST_FOR_EXCEPTION(
3788  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::rightScale: "
3789  "not implemented.");
3790 
3791  }
3792 
3793  template<class Scalar, class LO, class GO, class Node>
3794  Teuchos::RCP<const Tpetra::RowGraph<LO, GO, Node> >
3796  getGraph() const
3797  {
3798  return graphRCP_;
3799  }
3800 
3801  template<class Scalar, class LO, class GO, class Node>
3805  {
3806  TEUCHOS_TEST_FOR_EXCEPTION(
3807  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getFrobeniusNorm: "
3808  "not implemented.");
3809  }
3810 
3811 } // namespace Experimental
3812 } // namespace Tpetra
3813 
3814 //
3815 // Explicit instantiation macro
3816 //
3817 // Must be expanded from within the Tpetra::Experimental namespace!
3818 //
3819 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3820  template class Experimental::BlockCrsMatrix< S, LO, GO, NODE >;
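// Illustrative sketch: an explicit-instantiation translation unit expands
// this macro from within the enclosing namespace(s), along these lines
// (the template arguments shown are placeholders, not a supported list):
//
//   namespace Tpetra {
//   namespace Experimental {
//     TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT( double, int, long long, default_node_type )
//   } // namespace Experimental
//   } // namespace Tpetra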
3821 
3822 #endif // TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP