Tpetra parallel linear algebra  Version of the Day
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
43 #define TPETRA_MULTIVECTOR_DEF_HPP
44 
52 
53 #include "Tpetra_Util.hpp"
54 #include "Tpetra_Vector.hpp"
55 #include "Tpetra_Details_MultiVectorDistObjectKernels.hpp"
56 #include "Tpetra_Details_gathervPrint.hpp"
57 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
58 
59 #include "KokkosCompat_View.hpp"
60 #include "Kokkos_MV_GEMM.hpp"
61 #include "Kokkos_Blas1_MV.hpp"
62 #include "Kokkos_Random.hpp"
63 
64 #ifdef HAVE_TPETRA_INST_FLOAT128
65 namespace Kokkos {
66  // FIXME (mfh 04 Sep 2015) Just a stub for now!
67  template<class Generator>
68  struct rand<Generator, __float128> {
69  static KOKKOS_INLINE_FUNCTION __float128 max ()
70  {
71  return static_cast<__float128> (1.0);
72  }
73  static KOKKOS_INLINE_FUNCTION __float128
74  draw (Generator& gen)
75  {
76  // Half the smallest normalized double, is the scaling factor of
77  // the lower-order term in the double-double representation.
78  const __float128 scalingFactor =
79  static_cast<__float128> (std::numeric_limits<double>::min ()) /
80  static_cast<__float128> (2.0);
81  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
82  const __float128 lowerOrderTerm =
83  static_cast<__float128> (gen.drand ()) * scalingFactor;
84  return higherOrderTerm + lowerOrderTerm;
85  }
86  static KOKKOS_INLINE_FUNCTION __float128
87  draw (Generator& gen, const __float128& range)
88  {
89  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
90  const __float128 scalingFactor =
91  static_cast<__float128> (std::numeric_limits<double>::min ()) /
92  static_cast<__float128> (2.0);
93  const __float128 higherOrderTerm =
94  static_cast<__float128> (gen.drand (range));
95  const __float128 lowerOrderTerm =
96  static_cast<__float128> (gen.drand (range)) * scalingFactor;
97  return higherOrderTerm + lowerOrderTerm;
98  }
99  static KOKKOS_INLINE_FUNCTION __float128
100  draw (Generator& gen, const __float128& start, const __float128& end)
101  {
102  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
103  const __float128 scalingFactor =
104  static_cast<__float128> (std::numeric_limits<double>::min ()) /
105  static_cast<__float128> (2.0);
106  const __float128 higherOrderTerm =
107  static_cast<__float128> (gen.drand (start, end));
108  const __float128 lowerOrderTerm =
109  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
110  return higherOrderTerm + lowerOrderTerm;
111  }
112  };
113 } // namespace Kokkos
114 #endif // HAVE_TPETRA_INST_FLOAT128
115 
116 namespace { // (anonymous)
117 
132  template<class ST, class LO, class GO, class NT>
134  allocDualView (const size_t lclNumRows,
135  const size_t numCols,
136  const bool zeroOut = true,
137  const bool allowPadding = false)
138  {
139  using Kokkos::AllowPadding;
140  using Kokkos::view_alloc;
141  using Kokkos::WithoutInitializing;
142  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
143  typedef typename dual_view_type::t_dev dev_view_type;
144  // This needs to be a string and not a char*, if given as an
145  // argument to Kokkos::view_alloc. This is because view_alloc
146  // also allows a raw pointer as its first argument. See
147  // https://github.com/kokkos/kokkos/issues/434.
148  const std::string label ("MV::DualView");
149 
150  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
151  // creation of the DualView's Views works around
152  // Kokkos::DualView's current inability to accept an
153  // AllocationProperties initial argument (as Kokkos::View does).
154  // However, the work-around is harmless, since it does what the
155  // (currently nonexistent) equivalent DualView constructor would
156  // have done anyway.
157 
158  dev_view_type d_view;
159  if (zeroOut) {
160  if (allowPadding) {
161  d_view = dev_view_type (view_alloc (label, AllowPadding),
162  lclNumRows, numCols);
163  }
164  else {
165  d_view = dev_view_type (view_alloc (label),
166  lclNumRows, numCols);
167  }
168  }
169  else {
170  if (allowPadding) {
171  d_view = dev_view_type (view_alloc (label,
172  WithoutInitializing,
173  AllowPadding),
174  lclNumRows, numCols);
175  }
176  else {
177  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
178  lclNumRows, numCols);
179  }
180 #ifdef HAVE_TPETRA_DEBUG
181  // Filling with NaN is a cheap and effective way to tell if
182  // downstream code is trying to use a MultiVector's data without
183  // them having been initialized. ArithTraits lets us call nan()
184  // even if the scalar type doesn't define it; it just returns some
185  // undefined value in the latter case. This won't hurt anything
186  // because by setting zeroOut=false, users already agreed that
187  // they don't care about the contents of the MultiVector.
188  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
189  KokkosBlas::fill (d_view, nan);
190 #endif // HAVE_TPETRA_DEBUG
191  }
192 #ifdef HAVE_TPETRA_DEBUG
193  TEUCHOS_TEST_FOR_EXCEPTION
194  (static_cast<size_t> (d_view.dimension_0 ()) != lclNumRows ||
195  static_cast<size_t> (d_view.dimension_1 ()) != numCols, std::logic_error,
196  "allocDualView: d_view's dimensions actual dimensions do not match "
197  "requested dimensions. d_view is " << d_view.dimension_0 () << " x " <<
198  d_view.dimension_1 () << "; requested " << lclNumRows << " x " << numCols
199  << ". Please report this bug to the Tpetra developers.");
200 #endif // HAVE_TPETRA_DEBUG
201 
202  typename dual_view_type::t_host h_view = Kokkos::create_mirror_view (d_view);
203 
204  dual_view_type dv (d_view, h_view);
205  // Whether or not the user cares about the initial contents of the
206  // MultiVector, the device and host views are out of sync. We
207  // prefer to work in device memory. The way to ensure this
208  // happens is to mark the device view as modified.
209  dv.template modify<typename dev_view_type::memory_space> ();
210 
211  return dv;
212  }
213 
214  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
215  //
216  // T: The type of the entries of the View.
217  // ExecSpace: The Kokkos execution space.
218  template<class T, class ExecSpace>
219  struct MakeUnmanagedView {
220  // The 'false' part of the branch carefully ensures that this
221  // won't attempt to use a host execution space that hasn't been
222  // initialized. For example, if Kokkos::OpenMP is disabled and
223  // Kokkos::Threads is enabled, the latter is always the default
224  // execution space of Kokkos::HostSpace, even when ExecSpace is
225  // Kokkos::Serial. That's why we go through the trouble of asking
226  // Kokkos::DualView what _its_ space is. That seems to work
227  // around this default execution space issue.
228  //
229  // NOTE (mfh 29 Jan 2016): See kokkos/kokkos#178 for why we use
230  // a memory space, rather than an execution space, as the first
231  // argument of VerifyExecutionCanAccessMemorySpace.
232  typedef typename Kokkos::Impl::if_c<
233  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
234  typename ExecSpace::memory_space,
235  Kokkos::HostSpace>::value,
236  typename ExecSpace::device_type,
237  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
238  typedef Kokkos::LayoutLeft array_layout;
239  typedef Kokkos::View<T*, array_layout, host_exec_space,
240  Kokkos::MemoryUnmanaged> view_type;
241 
242  static view_type getView (const Teuchos::ArrayView<T>& x_in)
243  {
244  const size_t numEnt = static_cast<size_t> (x_in.size ());
245  if (numEnt == 0) {
246  return view_type ();
247  } else {
248  return view_type (x_in.getRawPtr (), numEnt);
249  }
250  }
251  };
252 
253  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
254  // taking a subview of a 0 x N DualView incorrectly always results
255  // in a 0 x 0 DualView.
256  template<class DualViewType>
257  DualViewType
258  takeSubview (const DualViewType& X,
259 //We will move the ALL_t to the Kokkos namespace eventually, this is a workaround for testing the new View implementation
260 #ifdef KOKKOS_USING_EXPERIMENTAL_VIEW
261  const Kokkos::Experimental::Impl::ALL_t&,
262 #else
263  const Kokkos::ALL&,
264 #endif
265  const std::pair<size_t, size_t>& colRng)
266  {
267  if (X.dimension_0 () == 0 && X.dimension_1 () != 0) {
268  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
269  }
270  else {
271  return subview (X, Kokkos::ALL (), colRng);
272  }
273  }
274 
275  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
276  // taking a subview of a 0 x N DualView incorrectly always results
277  // in a 0 x 0 DualView.
278  template<class DualViewType>
279  DualViewType
280  takeSubview (const DualViewType& X,
281  const std::pair<size_t, size_t>& rowRng,
282  const std::pair<size_t, size_t>& colRng)
283  {
284  if (X.dimension_0 () == 0 && X.dimension_1 () != 0) {
285  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
286  }
287  else {
288  return subview (X, rowRng, colRng);
289  }
290  }
291 } // namespace (anonymous)
292 
293 
294 namespace Tpetra {
295 
296  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
297  bool
298  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
299  vectorIndexOutOfRange (const size_t VectorIndex) const {
300  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
301  }
302 
303  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
306  base_type (Teuchos::rcp (new map_type ()))
307  {}
308 
309  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
311  MultiVector (const Teuchos::RCP<const map_type>& map,
312  const size_t numVecs,
313  const bool zeroOut) : /* default is true */
314  base_type (map)
315  {
316  const size_t lclNumRows = this->getLocalLength ();
317  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
318  origView_ = view_;
319  }
320 
321  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
324  base_type (source),
325  view_ (source.view_),
326  origView_ (source.origView_),
327  whichVectors_ (source.whichVectors_)
328  {}
329 
330  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
333  const Teuchos::DataAccess copyOrView) :
334  base_type (source),
335  view_ (source.view_),
336  origView_ (source.origView_),
337  whichVectors_ (source.whichVectors_)
338  {
340  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
341  "const Teuchos::DataAccess): ";
342 
343  if (copyOrView == Teuchos::Copy) {
344  // Reuse the conveniently already existing function that creates
345  // a deep copy.
346  MV cpy = createCopy (source);
347  this->view_ = cpy.view_;
348  this->origView_ = cpy.origView_;
349  this->whichVectors_ = cpy.whichVectors_;
350  }
351  else if (copyOrView == Teuchos::View) {
352  }
353  else {
354  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
355  true, std::invalid_argument, "Second argument 'copyOrView' has an "
356  "invalid value " << copyOrView << ". Valid values include "
357  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
358  << Teuchos::View << ".");
359  }
360  }
361 
362  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
364  MultiVector (const Teuchos::RCP<const map_type>& map,
365  const dual_view_type& view) :
366  base_type (map),
367  view_ (view),
368  origView_ (view)
369  {
370  const char tfecfFuncName[] = "MultiVector(map,view): ";
371 
372  // Get stride of view: if second dimension is 0, the
373  // stride might be 0, so take view_dimension instead.
374  size_t stride[8];
375  origView_.stride (stride);
376  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
377  origView_.dimension_0 ();
378  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
379  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
380  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
381  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
382  << ". This may also mean that the input view's first dimension (number "
383  "of rows = " << view.dimension_0 () << ") does not not match the number "
384  "of entries in the Map on the calling process.");
385  }
386 
387 
388  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
390  MultiVector (const Teuchos::RCP<const map_type>& map,
391  const typename dual_view_type::t_dev& d_view) :
392  base_type (map)
393  {
394  using Teuchos::ArrayRCP;
395  using Teuchos::RCP;
396  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
397 
398  // Get stride of view: if second dimension is 0, the stride might
399  // be 0, so take view_dimension instead.
400  size_t stride[8];
401  d_view.stride (stride);
402  const size_t LDA = (d_view.dimension_1 () > 1) ? stride[1] :
403  d_view.dimension_0 ();
404  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
405  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
406  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::View's "
407  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
408  << ". This may also mean that the input view's first dimension (number "
409  "of rows = " << d_view.dimension_0 () << ") does not not match the "
410  "number of entries in the Map on the calling process.");
411 
412  // The difference between create_mirror and create_mirror_view, is
413  // that the latter copies to host. We don't necessarily want to
414  // do that; we just want to allocate the memory.
415  view_ = dual_view_type (d_view, Kokkos::create_mirror (d_view));
416  // The user gave us a device view. We take it as canonical, which
417  // means we mark it as "modified," so that the next sync will
418  // synchronize it with the host view.
419  this->template modify<device_type> ();
420  origView_ = view_;
421  }
422 
423  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
425  MultiVector (const Teuchos::RCP<const map_type>& map,
426  const dual_view_type& view,
427  const dual_view_type& origView) :
428  base_type (map),
429  view_ (view),
430  origView_ (origView)
431  {
432  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
433 
434  // Get stride of view: if second dimension is 0, the
435  // stride might be 0, so take view_dimension instead.
436  size_t stride[8];
437  origView_.stride (stride);
438  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
439  origView_.dimension_0 ();
440  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
441  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
442  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
443  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
444  << ". This may also mean that the input origView's first dimension (number "
445  "of rows = " << origView.dimension_0 () << ") does not not match the number "
446  "of entries in the Map on the calling process.");
447  }
448 
449 
450  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
452  MultiVector (const Teuchos::RCP<const map_type>& map,
453  const dual_view_type& view,
454  const Teuchos::ArrayView<const size_t>& whichVectors) :
455  base_type (map),
456  view_ (view),
457  origView_ (view),
458  whichVectors_ (whichVectors.begin (), whichVectors.end ())
459  {
460  using Kokkos::ALL;
461  using Kokkos::subview;
462  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
463 
464  const size_t lclNumRows = map.is_null () ? size_t (0) :
465  map->getNodeNumElements ();
466  // Check dimensions of the input DualView. We accept that Kokkos
467  // might not allow construction of a 0 x m (Dual)View with m > 0,
468  // so we only require the number of rows to match if the
469  // (Dual)View has more than zero columns. Likewise, we only
470  // require the number of columns to match if the (Dual)View has
471  // more than zero rows.
472  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
473  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_0 ()) < lclNumRows,
474  std::invalid_argument, "view.dimension_0() = " << view.dimension_0 ()
475  << " < map->getNodeNumElements() = " << lclNumRows << ".");
476  if (whichVectors.size () != 0) {
477  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
478  view.dimension_1 () != 0 && view.dimension_1 () == 0,
479  std::invalid_argument, "view.dimension_1() = 0, but whichVectors.size()"
480  " = " << whichVectors.size () << " > 0.");
481  size_t maxColInd = 0;
482  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
483  for (size_type k = 0; k < whichVectors.size (); ++k) {
484  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
485  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
486  std::invalid_argument, "whichVectors[" << k << "] = "
487  "Teuchos::OrdinalTraits<size_t>::invalid().");
488  maxColInd = std::max (maxColInd, whichVectors[k]);
489  }
490  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
491  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_1 ()) <= maxColInd,
492  std::invalid_argument, "view.dimension_1() = " << view.dimension_1 ()
493  << " <= max(whichVectors) = " << maxColInd << ".");
494  }
495 
496  // Get stride of view: if second dimension is 0, the
497  // stride might be 0, so take view_dimension instead.
498  size_t stride[8];
499  origView_.stride (stride);
500  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
501  origView_.dimension_0 ();
502  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
503  LDA < lclNumRows, std::invalid_argument,
504  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
505 
506  if (whichVectors.size () == 1) {
507  // If whichVectors has only one entry, we don't need to bother
508  // with nonconstant stride. Just shift the view over so it
509  // points to the desired column.
510  //
511  // NOTE (mfh 10 May 2014) This is a special case where we set
512  // origView_ just to view that one column, not all of the
513  // original columns. This ensures that the use of origView_ in
514  // offsetView works correctly.
515  const std::pair<size_t, size_t> colRng (whichVectors[0],
516  whichVectors[0] + 1);
517  view_ = takeSubview (view_, ALL (), colRng);
518  origView_ = takeSubview (origView_, ALL (), colRng);
519  // whichVectors_.size() == 0 means "constant stride."
520  whichVectors_.clear ();
521  }
522  }
523 
524  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
526  MultiVector (const Teuchos::RCP<const map_type>& map,
527  const dual_view_type& view,
528  const dual_view_type& origView,
529  const Teuchos::ArrayView<const size_t>& whichVectors) :
530  base_type (map),
531  view_ (view),
532  origView_ (origView),
533  whichVectors_ (whichVectors.begin (), whichVectors.end ())
534  {
535  using Kokkos::ALL;
536  using Kokkos::subview;
537  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
538 
539  const size_t lclNumRows = this->getLocalLength ();
540  // Check dimensions of the input DualView. We accept that Kokkos
541  // might not allow construction of a 0 x m (Dual)View with m > 0,
542  // so we only require the number of rows to match if the
543  // (Dual)View has more than zero columns. Likewise, we only
544  // require the number of columns to match if the (Dual)View has
545  // more than zero rows.
546  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
547  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_0 ()) < lclNumRows,
548  std::invalid_argument, "view.dimension_0() = " << view.dimension_0 ()
549  << " < map->getNodeNumElements() = " << lclNumRows << ".");
550  if (whichVectors.size () != 0) {
551  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
552  view.dimension_1 () != 0 && view.dimension_1 () == 0,
553  std::invalid_argument, "view.dimension_1() = 0, but whichVectors.size()"
554  " = " << whichVectors.size () << " > 0.");
555  size_t maxColInd = 0;
556  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
557  for (size_type k = 0; k < whichVectors.size (); ++k) {
558  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
559  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
560  std::invalid_argument, "whichVectors[" << k << "] = "
561  "Teuchos::OrdinalTraits<size_t>::invalid().");
562  maxColInd = std::max (maxColInd, whichVectors[k]);
563  }
564  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
565  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_1 ()) <= maxColInd,
566  std::invalid_argument, "view.dimension_1() = " << view.dimension_1 ()
567  << " <= max(whichVectors) = " << maxColInd << ".");
568  }
569  // Get stride of view: if second dimension is 0, the
570  // stride might be 0, so take view_dimension instead.
571  size_t stride[8];
572  origView_.stride (stride);
573  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
574  origView_.dimension_0 ();
575  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
576  LDA < lclNumRows, std::invalid_argument, "Input DualView's column stride"
577  " = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
578 
579  if (whichVectors.size () == 1) {
580  // If whichVectors has only one entry, we don't need to bother
581  // with nonconstant stride. Just shift the view over so it
582  // points to the desired column.
583  //
584  // NOTE (mfh 10 May 2014) This is a special case where we set
585  // origView_ just to view that one column, not all of the
586  // original columns. This ensures that the use of origView_ in
587  // offsetView works correctly.
588  const std::pair<size_t, size_t> colRng (whichVectors[0],
589  whichVectors[0] + 1);
590  view_ = takeSubview (view_, ALL (), colRng);
591  origView_ = takeSubview (origView_, ALL (), colRng);
592  // whichVectors_.size() == 0 means "constant stride."
593  whichVectors_.clear ();
594  }
595  }
596 
597  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
599  MultiVector (const Teuchos::RCP<const map_type>& map,
600  const Teuchos::ArrayView<const Scalar>& data,
601  const size_t LDA,
602  const size_t numVecs) :
603  base_type (map)
604  {
605  typedef LocalOrdinal LO;
606  typedef GlobalOrdinal GO;
607  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
608 
609  // Deep copy constructor, constant stride (NO whichVectors_).
610  // There is no need for a deep copy constructor with nonconstant stride.
611 
612  const size_t lclNumRows =
613  map.is_null () ? size_t (0) : map->getNodeNumElements ();
614  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
615  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
616  "map->getNodeNumElements() = " << lclNumRows << ".");
617  if (numVecs != 0) {
618  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
619  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
620  (static_cast<size_t> (data.size ()) < minNumEntries,
621  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
622  "entries, given the input Map and number of vectors in the MultiVector."
623  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
624  "map->getNodeNumElements () = " << minNumEntries << ".");
625  }
626 
627  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
628  this->template modify<device_type> ();
629  auto X_out = this->template getLocalView<device_type> ();
630  origView_ = view_;
631 
632  // Make an unmanaged host Kokkos::View of the input data. First
633  // create a View (X_in_orig) with the original stride. Then,
634  // create a subview (X_in) with the right number of columns.
635  const impl_scalar_type* const X_in_raw =
636  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
637  Kokkos::View<const impl_scalar_type**,
638  Kokkos::LayoutLeft,
639  Kokkos::HostSpace,
640  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
641  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
642  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
643 
644  // If LDA != X_out's column stride, then we need to copy one
645  // column at a time; Kokkos::deep_copy refuses to work in that
646  // case.
647  size_t outStrides[8];
648  X_out.stride (outStrides);
649  const size_t outStride = (X_out.dimension_1 () > 1) ? outStrides[1] :
650  X_out.dimension_0 ();
651  if (LDA == outStride) { // strides are the same; deep_copy once
652  // This only works because MultiVector uses LayoutLeft.
653  // We would need a custom copy functor otherwise.
654  Kokkos::deep_copy (X_out, X_in);
655  }
656  else { // strides differ; copy one column at a time
657  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
658  out_col_view_type;
659  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
660  in_col_view_type;
661  for (size_t j = 0; j < numVecs; ++j) {
662  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
663  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
664  Kokkos::deep_copy (X_out_j, X_in_j);
665  }
666  }
667  }
668 
669  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
671  MultiVector (const Teuchos::RCP<const map_type>& map,
672  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
673  const size_t numVecs) :
674  base_type (map)
675  {
676  typedef impl_scalar_type IST;
677  typedef LocalOrdinal LO;
678  typedef GlobalOrdinal GO;
679  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
680 
681  const size_t lclNumRows =
682  map.is_null () ? size_t (0) : map->getNodeNumElements ();
683  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
684  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
685  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
686  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
687  for (size_t j = 0; j < numVecs; ++j) {
688  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
689  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
690  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
691  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
692  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
693  << ".");
694  }
695 
696  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
697  this->template modify<device_type> ();
698  auto X_out = this->getLocalView<device_type> ();
699 
700  // Make sure that the type of a single input column has the same
701  // array layout as each output column does, so we can deep_copy.
702  typedef typename decltype (X_out)::array_layout array_layout;
703  typedef typename Kokkos::View<const IST*,
704  array_layout,
705  Kokkos::HostSpace,
706  Kokkos::MemoryUnmanaged> input_col_view_type;
707 
708  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
709  for (size_t j = 0; j < numVecs; ++j) {
710  Teuchos::ArrayView<const IST> X_j_av =
711  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
712  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
713  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
714  Kokkos::deep_copy (X_j_out, X_j_in);
715  }
716  origView_ = view_;
717  }
718 
719  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
722 
723  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
726  return whichVectors_.empty ();
727  }
728 
729  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
730  size_t
733  {
734  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
735  return static_cast<size_t> (0);
736  } else {
737  return this->getMap ()->getNodeNumElements ();
738  }
739  }
740 
741  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
745  {
746  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
747  return static_cast<size_t> (0);
748  } else {
749  return this->getMap ()->getGlobalNumElements ();
750  }
751  }
752 
753  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
754  size_t
756  getStride () const
757  {
758  if (isConstantStride ()) {
759  // Get stride of view: if second dimension is 0, the
760  // stride might be 0, so take view_dimension instead.
761  size_t stride[8];
762  origView_.stride (stride);
763  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
764  origView_.dimension_0 ();
765  return LDA;
766  }
767  else {
768  return static_cast<size_t> (0);
769  }
770  }
771 
772  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
773  bool
775  checkSizes (const SrcDistObject& sourceObj)
776  {
777  // Check whether the source object is a MultiVector. If not, then
778  // we can't even compare sizes, so it's definitely not OK to
779  // Import or Export from it.
781  const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
782  if (src == NULL) {
783  return false;
784  } else {
785  // The target of the Import or Export calls checkSizes() in
786  // DistObject::doTransfer(). By that point, we've already
787  // constructed an Import or Export object using the two
788  // multivectors' Maps, which means that (hopefully) we've
789  // already checked other attributes of the multivectors. Thus,
790  // all we need to do here is check the number of columns.
791  return src->getNumVectors () == this->getNumVectors ();
792  }
793  }
794 
795  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
796  size_t
799  return this->getNumVectors ();
800  }
801 
802  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
803  void
805  copyAndPermuteNew (const SrcDistObject& sourceObj,
806  const size_t numSameIDs,
807  const Kokkos::DualView<const LocalOrdinal*, device_type>& permuteToLIDs,
808  const Kokkos::DualView<const LocalOrdinal*, device_type>& permuteFromLIDs)
809  {
811  using KokkosRefactor::Details::permute_array_multi_column;
812  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
813  using Kokkos::Compat::create_const_view;
814  typedef typename dual_view_type::t_dev::memory_space DMS;
815  typedef typename dual_view_type::t_host::memory_space HMS;
817  const char tfecfFuncName[] = "copyAndPermuteNew: ";
818 
819  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
820  permuteToLIDs.dimension_0 () != permuteFromLIDs.dimension_0 (),
821  std::runtime_error, "permuteToLIDs.dimension_0() = "
822  << permuteToLIDs.dimension_0 () << " != permuteFromLIDs.dimension_0() = "
823  << permuteFromLIDs.dimension_0 () << ".");
824 
825  // We've already called checkSizes(), so this cast must succeed.
826  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
827  const size_t numCols = this->getNumVectors ();
828 
829  // The input sourceObj controls whether we copy on host or device.
830  // This is because this method is not allowed to modify sourceObj,
831  // so it may not sync sourceObj; it must use the most recently
832  // updated version (host or device) of sourceObj's data.
833  //
834  // If we need sync to device, then host has the most recent version.
835  const bool copyOnHost = sourceMV.template need_sync<device_type> ();
836 
837  if (copyOnHost) {
838  this->template sync<Kokkos::HostSpace> ();
839  this->template modify<Kokkos::HostSpace> ();
840  }
841  else {
842  this->template sync<device_type> ();
843  this->template modify<device_type> ();
844  }
845 
846  // TODO (mfh 15 Sep 2013) When we replace
847  // KokkosClassic::MultiVector with a Kokkos::View, there are two
848  // ways to copy the data:
849  //
850  // 1. Get a (sub)view of each column and call deep_copy on that.
851  // 2. Write a custom kernel to copy the data.
852  //
853  // The first is easier, but the second might be more performant in
854  // case we decide to use layouts other than LayoutLeft. It might
855  // even make sense to hide whichVectors_ in an entirely new layout
856  // for Kokkos Views.
857 
858  // Copy rows [0, numSameIDs-1] of the local multivectors.
859  //
860  // Note (ETP 2 Jul 2014) We need to always copy one column at a
861  // time, even when both multivectors are constant-stride, since
862  // deep_copy between strided subviews with more than one column
863  // doesn't currently work.
864 
865  if (numSameIDs > 0) {
866  const std::pair<size_t, size_t> rows (0, numSameIDs);
867  if (copyOnHost) {
868  auto tgt_h = this->template getLocalView<HMS> ();
869  auto src_h = create_const_view (sourceMV.template getLocalView<HMS> ());
870 
871  for (size_t j = 0; j < numCols; ++j) {
872  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
873  const size_t srcCol =
874  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
875 
876  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
877  auto src_j = Kokkos::subview (src_h, rows, srcCol);
878  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
879  }
880  }
881  else { // copy on device
882  auto tgt_d = this->template getLocalView<DMS> ();
883  auto src_d = create_const_view (sourceMV.template getLocalView<DMS> ());
884 
885  for (size_t j = 0; j < numCols; ++j) {
886  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
887  const size_t srcCol =
888  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
889 
890  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
891  auto src_j = Kokkos::subview (src_d, rows, srcCol);
892  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
893  }
894  }
895  }
896 
897  // For the remaining GIDs, execute the permutations. This may
898  // involve noncontiguous access of both source and destination
899  // vectors, depending on the LID lists.
900  //
901  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
902  // the same process, this merges their values by replacement of
903  // the last encountered GID, not by the specified merge rule
904  // (such as ADD).
905 
906  // If there are no permutations, we are done
907  if (permuteFromLIDs.dimension_0 () == 0 ||
908  permuteToLIDs.dimension_0 () == 0) {
909  return;
910  }
911 
912  // This gets around const-ness of the DualView input. In
913  // particular, it gives us freedom to sync them where we need
914  // them.
915  Kokkos::DualView<const LocalOrdinal*, device_type> permuteToLIDs_nc =
916  permuteToLIDs;
917  Kokkos::DualView<const LocalOrdinal*, device_type> permuteFromLIDs_nc =
918  permuteFromLIDs;
919 
920  // We could in theory optimize for the case where exactly one of
921  // them is constant stride, but we don't currently do that.
922  const bool nonConstStride =
923  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
924 
925  // We only need the "which vectors" arrays if either the source or
926  // target MV is not constant stride. Since we only have one
927  // kernel that must do double-duty, we have to create a "which
928  // vectors" array for the MV that _is_ constant stride.
929  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
930  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
931  if (nonConstStride) {
932  if (this->whichVectors_.size () == 0) {
933  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
934  tmpTgt.template modify<HMS> ();
935  for (size_t j = 0; j < numCols; ++j) {
936  tmpTgt.h_view(j) = j;
937  }
938  if (! copyOnHost) {
939  tmpTgt.template sync<DMS> ();
940  }
941  tgtWhichVecs = tmpTgt;
942  }
943  else {
944  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
945  tgtWhichVecs =
946  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
947  "tgtWhichVecs",
948  copyOnHost);
949  }
950  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
951  (static_cast<size_t> (tgtWhichVecs.dimension_0 ()) !=
952  this->getNumVectors (),
953  std::logic_error, "tgtWhichVecs.dimension_0() = " <<
954  tgtWhichVecs.dimension_0 () << " != this->getNumVectors() = " <<
955  this->getNumVectors () << ".");
956 
957  if (sourceMV.whichVectors_.size () == 0) {
958  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
959  tmpSrc.template modify<HMS> ();
960  for (size_t j = 0; j < numCols; ++j) {
961  tmpSrc.h_view(j) = j;
962  }
963  if (! copyOnHost) {
964  tmpSrc.template sync<DMS> ();
965  }
966  srcWhichVecs = tmpSrc;
967  }
968  else {
969  Teuchos::ArrayView<const size_t> srcWhichVecsT =
970  sourceMV.whichVectors_ ();
971  srcWhichVecs =
972  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
973  "srcWhichVecs",
974  copyOnHost);
975  }
976  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
977  (static_cast<size_t> (srcWhichVecs.dimension_0 ()) !=
978  sourceMV.getNumVectors (), std::logic_error,
979  "srcWhichVecs.dimension_0() = " << srcWhichVecs.dimension_0 ()
980  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
981  << ".");
982  }
983 
984  if (copyOnHost) {
985  auto tgt_h = this->template getLocalView<HMS> ();
986  auto src_h = create_const_view (sourceMV.template getLocalView<HMS> ());
987  permuteToLIDs_nc.template sync<HMS> ();
988  auto permuteToLIDs_h =
989  create_const_view (permuteToLIDs_nc.template view<HMS> ());
990  permuteFromLIDs_nc.template sync<HMS> ();
991  auto permuteFromLIDs_h =
992  create_const_view (permuteFromLIDs_nc.template view<HMS> ());
993 
994  if (nonConstStride) {
995  // No need to sync first, because copyOnHost argument to
996  // getDualViewCopyFromArrayView puts them in the right place.
997  auto tgtWhichVecs_h =
998  create_const_view (tgtWhichVecs.template view<HMS> ());
999  auto srcWhichVecs_h =
1000  create_const_view (srcWhichVecs.template view<HMS> ());
1001  permute_array_multi_column_variable_stride (tgt_h, src_h,
1002  permuteToLIDs_h,
1003  permuteFromLIDs_h,
1004  tgtWhichVecs_h,
1005  srcWhichVecs_h, numCols);
1006  }
1007  else {
1008  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1009  permuteFromLIDs_h, numCols);
1010  }
1011  }
1012  else { // copy on device
1013  auto tgt_d = this->template getLocalView<DMS> ();
1014  auto src_d = create_const_view (sourceMV.template getLocalView<DMS> ());
1015  permuteToLIDs_nc.template sync<DMS> ();
1016  auto permuteToLIDs_d =
1017  create_const_view (permuteToLIDs_nc.template view<DMS> ());
1018  permuteFromLIDs_nc.template sync<DMS> ();
1019  auto permuteFromLIDs_d =
1020  create_const_view (permuteFromLIDs_nc.template view<DMS> ());
1021 
1022  if (nonConstStride) {
1023  // No need to sync first, because copyOnHost argument to
1024  // getDualViewCopyFromArrayView puts them in the right place.
1025  auto tgtWhichVecs_d =
1026  create_const_view (tgtWhichVecs.template view<DMS> ());
1027  auto srcWhichVecs_d =
1028  create_const_view (srcWhichVecs.template view<DMS> ());
1029  permute_array_multi_column_variable_stride (tgt_d, src_d,
1030  permuteToLIDs_d,
1031  permuteFromLIDs_d,
1032  tgtWhichVecs_d,
1033  srcWhichVecs_d, numCols);
1034  }
1035  else {
1036  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1037  permuteFromLIDs_d, numCols);
1038  }
1039  }
1040  }
1041 
1042  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1043  void
1044  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
1045  packAndPrepareNew (const SrcDistObject& sourceObj,
1046  const Kokkos::DualView<const local_ordinal_type*, device_type> &exportLIDs,
1047  Kokkos::DualView<impl_scalar_type*, device_type>& exports,
1048  const Kokkos::DualView<size_t*, device_type>& /* numExportPacketsPerLID */,
1049  size_t& constantNumPackets,
1050  Distributor & /* distor */ )
1051  {
1052  using Kokkos::Compat::create_const_view;
1053  using Kokkos::Compat::getKokkosViewDeepCopy;
1054  typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> MV;
1055  typedef impl_scalar_type IST;
1056  typedef typename Kokkos::DualView<IST*, device_type>::t_host::memory_space
1057  host_memory_space;
1058  typedef typename Kokkos::DualView<IST*, device_type>::t_dev::memory_space
1059  dev_memory_space;
1060  typedef typename Kokkos::DualView<IST*, device_type>::t_host::execution_space
1061  host_execution_space;
1062  typedef typename Kokkos::DualView<IST*, device_type>::t_dev::execution_space
1063  dev_execution_space;
1064 
1065  // TODO (mfh 09 Sep 2016): The pack and unpack functions now have
1066  // the option to check indices. We do so in a debug build. At
1067  // some point, it would make sense to shift this to a run-time
1068  // option, controlled by environment variable.
1069 #ifdef HAVE_TPETRA_DEBUG
1070  constexpr bool debugCheckIndices = true;
1071 #else
1072  constexpr bool debugCheckIndices = false;
1073 #endif // HAVE_TPETRA_DEBUG
1074 
1075  const bool printDebugOutput = false;
1076  if (printDebugOutput) {
1077  std::cerr << "$$$ MV::packAndPrepareNew" << std::endl;
1078  }
1079  // We've already called checkSizes(), so this cast must succeed.
1080  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
1081 
1082  // mfh 12 Apr 2016: packAndPrepareNew decides where to pack based
1083  // on the memory space in which exportLIDs was last modified.
1084  // (DistObject::doTransferNew decides this currently.)
1085  //
1086  // We unfortunately can't change the source object sourceMV.
1087  // Thus, we can't sync it to the memory space where we want to
1088  // pack communication buffers. As a result, for example, if
1089  // exportLIDs wants us to pack on host, but sourceMV was last
1090  // modified on device, we have to allocate a temporary host
1091  // version and copy back to host before we can pack. Similarly,
1092  // if exportLIDs wants us to pack on device, but sourceMV was last
1093  // modified on host, we have to allocate a temporary device
1094  // version and copy back to device before we can pack.
1095  const bool packOnHost =
1096  exportLIDs.modified_host () > exportLIDs.modified_device ();
1097  auto src_dev = sourceMV.template getLocalView<dev_memory_space> ();
1098  auto src_host = sourceMV.template getLocalView<host_memory_space> ();
1099  if (packOnHost) {
1100  if (sourceMV.template need_sync<Kokkos::HostSpace> ()) {
1101  // sourceMV was most recently updated on device; copy to host.
1102  // Allocate a new host mirror. We'll use it for packing below.
1103  src_host = decltype (src_host) ("MV::DualView::h_view",
1104  src_dev.dimension_0 (),
1105  src_dev.dimension_1 ());
1106  Kokkos::deep_copy (src_host, src_dev);
1107  }
1108  }
1109  else { // pack on device
1110  if (sourceMV.template need_sync<device_type> ()) {
1111  // sourceMV was most recently updated on host; copy to device.
1112  // Allocate a new "device mirror." We'll use it for packing below.
1113  src_dev = decltype (src_dev) ("MV::DualView::d_view",
1114  src_host.dimension_0 (),
1115  src_host.dimension_1 ());
1116  Kokkos::deep_copy (src_dev, src_host);
1117  }
1118  }
1119 
1120  const size_t numCols = sourceMV.getNumVectors ();
1121 
1122  // This spares us from needing to fill numExportPacketsPerLID.
1123  // Setting constantNumPackets to a nonzero value signals that
1124  // all packets have the same number of entries.
1125  constantNumPackets = numCols;
1126 
1127  // If we have no exports, there is nothing to do. Make sure this
1128  // goes _after_ setting constantNumPackets correctly.
1129  if (exportLIDs.dimension_0 () == 0) {
1130  if (printDebugOutput) {
1131  std::cerr << "$$$ MV::packAndPrepareNew DONE" << std::endl;
1132  }
1133  return;
1134  }
1135 
1136  /* The layout in the export for MultiVectors is as follows:
1137  exports = { all of the data from row exportLIDs.front() ;
1138  ....
1139  all of the data from row exportLIDs.back() }
1140  This doesn't have the best locality, but is necessary because
1141  the data for a Packet (all data associated with an LID) is
1142  required to be contiguous. */
1143 
1144  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1145  // packing scheme in the above comment? The data going to a
1146  // particular process must be contiguous, of course, but those
1147  // data could include entries from multiple LIDs. DistObject just
1148  // needs to know how to index into that data. Kokkos is good at
1149  // decoupling storage intent from data layout choice.
1150 
1151  if (printDebugOutput) {
1152  std::cerr << "$$$ MV::packAndPrepareNew realloc" << std::endl;
1153  }
1154 
1155  const size_t numExportLIDs = exportLIDs.dimension_0 ();
1156  const size_t newExportsSize = numCols * numExportLIDs;
1157  if (static_cast<size_t> (exports.dimension_0 ()) != newExportsSize) {
1158  if (printDebugOutput) {
1159  std::ostringstream os;
1160  const int myRank = this->getMap ()->getComm ()->getRank ();
1161  os << "$$$ MV::packAndPrepareNew (Proc " << myRank << ") realloc "
1162  "exports from " << exports.dimension_0 () << " to " << newExportsSize
1163  << std::endl;
1164  std::cerr << os.str ();
1165  }
1166  // Resize 'exports' to fit.
1167  execution_space::fence ();
1168  exports = Kokkos::DualView<impl_scalar_type*, device_type> ("exports", newExportsSize);
1169  execution_space::fence ();
1170  }
1171 
1172  // Mark 'exports' here, since we might resize it above. Resizing
1173  // currently requires calling the constructor, which clears out
1174  // the 'modified' flags.
1175  if (packOnHost) {
1176  exports.template modify<host_memory_space> ();
1177  }
1178  else {
1179  exports.template modify<dev_memory_space> ();
1180  }
1181 
1182  if (numCols == 1) { // special case for one column only
1183  // MultiVector always represents a single column with constant
1184  // stride, but it doesn't hurt to implement both cases anyway.
1185  //
1186  // ETP: I'm not sure I agree with the above statement. Can't a single-
1187  // column multivector be a subview of another multi-vector, in which case
1188  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1189  // separately.
1190  //
1191  // mfh 18 Jan 2016: In answer to ETP's comment above:
1192  // MultiVector treats single-column MultiVectors created using a
1193  // "nonconstant stride constructor" as a special case, and makes
1194  // them constant stride (by making whichVectors_ have length 0).
1195  if (sourceMV.isConstantStride ()) {
1196  using KokkosRefactor::Details::pack_array_single_column;
1197  if (printDebugOutput) {
1198  std::cerr << "$$$ MV::packAndPrepareNew pack numCols=1 const stride" << std::endl;
1199  }
1200  if (packOnHost) {
1201  pack_array_single_column (exports.template view<host_memory_space> (),
1202  create_const_view (src_host),
1203  exportLIDs.template view<host_memory_space> (),
1204  0,
1205  debugCheckIndices);
1206  }
1207  else { // pack on device
1208  pack_array_single_column (exports.template view<dev_memory_space> (),
1209  create_const_view (src_dev),
1210  exportLIDs.template view<dev_memory_space> (),
1211  0,
1212  debugCheckIndices);
1213  }
1214  }
1215  else {
1216  using KokkosRefactor::Details::pack_array_single_column;
1217  if (printDebugOutput) {
1218  std::cerr << "$$$ MV::packAndPrepareNew pack numCols=1 nonconst stride" << std::endl;
1219  }
1220  if (packOnHost) {
1221  pack_array_single_column (exports.template view<host_memory_space> (),
1222  create_const_view (src_host),
1223  exportLIDs.template view<host_memory_space> (),
1224  sourceMV.whichVectors_[0],
1225  debugCheckIndices);
1226  }
1227  else { // pack on device
1228  pack_array_single_column (exports.template view<dev_memory_space> (),
1229  create_const_view (src_dev),
1230  exportLIDs.template view<dev_memory_space> (),
1231  sourceMV.whichVectors_[0],
1232  debugCheckIndices);
1233  }
1234  }
1235  }
1236  else { // the source MultiVector has multiple columns
1237  if (sourceMV.isConstantStride ()) {
1238  using KokkosRefactor::Details::pack_array_multi_column;
1239  if (printDebugOutput) {
1240  std::cerr << "$$$ MV::packAndPrepareNew pack numCols>1 const stride" << std::endl;
1241  }
1242  if (packOnHost) {
1243  pack_array_multi_column (exports.template view<host_memory_space> (),
1244  create_const_view (src_host),
1245  exportLIDs.template view<host_memory_space> (),
1246  numCols,
1247  debugCheckIndices);
1248  }
1249  else { // pack on device
1250  pack_array_multi_column (exports.template view<dev_memory_space> (),
1251  create_const_view (src_dev),
1252  exportLIDs.template view<dev_memory_space> (),
1253  numCols,
1254  debugCheckIndices);
1255  }
1256  }
1257  else {
1258  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1259  if (printDebugOutput) {
1260  std::cerr << "$$$ MV::packAndPrepareNew pack numCols>1 nonconst stride" << std::endl;
1261  }
1262  if (packOnHost) {
1263  pack_array_multi_column_variable_stride (exports.template view<host_memory_space> (),
1264  create_const_view (src_host),
1265  exportLIDs.template view<host_memory_space> (),
1266  getKokkosViewDeepCopy<host_execution_space> (sourceMV.whichVectors_ ()),
1267  numCols,
1268  debugCheckIndices);
1269  }
1270  else { // pack on device
1271  pack_array_multi_column_variable_stride (exports.template view<dev_memory_space> (),
1272  create_const_view (src_dev),
1273  exportLIDs.template view<dev_memory_space> (),
1274  getKokkosViewDeepCopy<dev_execution_space> (sourceMV.whichVectors_ ()),
1275  numCols,
1276  debugCheckIndices);
1277  }
1278  }
1279  }
1280 
1281  if (printDebugOutput) {
1282  std::cerr << "$$$ MV::packAndPrepareNew DONE" << std::endl;
1283  }
1284  }
1285 
1286 
1287  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1288  void
1289  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
1290  unpackAndCombineNew (const Kokkos::DualView<const local_ordinal_type*, device_type>& importLIDs,
1291  const Kokkos::DualView<const impl_scalar_type*, device_type>& imports,
1292  const Kokkos::DualView<const size_t*, device_type>& /* numPacketsPerLID */,
1293  const size_t constantNumPackets,
1294  Distributor& /* distor */,
1295  const CombineMode CM)
1296  {
1297  using KokkosRefactor::Details::unpack_array_multi_column;
1298  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1299  using Kokkos::Compat::getKokkosViewDeepCopy;
1300  typedef impl_scalar_type IST;
1301  typedef typename Kokkos::DualView<IST*, device_type>::t_host::memory_space
1302  host_memory_space;
1303  typedef typename Kokkos::DualView<IST*, device_type>::t_dev::memory_space
1304  dev_memory_space;
1305  const char tfecfFuncName[] = "unpackAndCombineNew: ";
1306  const char suffix[] = " Please report this bug to the Tpetra developers.";
1307 
1308  // TODO (mfh 09 Sep 2016): The pack and unpack functions now have
1309  // the option to check indices. We do so in a debug build. At
1310  // some point, it would make sense to shift this to a run-time
1311  // option, controlled by environment variable.
1312 #ifdef HAVE_TPETRA_DEBUG
1313  constexpr bool debugCheckIndices = true;
1314 #else
1315  constexpr bool debugCheckIndices = false;
1316 #endif // HAVE_TPETRA_DEBUG
1317 
1318  // If we have no imports, there is nothing to do
1319  if (importLIDs.dimension_0 () == 0) {
1320  return;
1321  }
1322 
1323  const size_t numVecs = getNumVectors ();
1324 #ifdef HAVE_TPETRA_DEBUG
1325  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1326  static_cast<size_t> (imports.dimension_0 ()) !=
1327  numVecs * importLIDs.dimension_0 (),
1328  std::runtime_error,
1329  "imports.dimension_0() = " << imports.dimension_0 ()
1330  << " != getNumVectors() * importLIDs.dimension_0() = " << numVecs
1331  << " * " << importLIDs.dimension_0 () << " = "
1332  << numVecs * importLIDs.dimension_0 () << ".");
1333 
1334  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1335  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1336  "constantNumPackets input argument must be nonzero.");
1337 
1338  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1339  (static_cast<size_t> (numVecs) !=
1340  static_cast<size_t> (constantNumPackets),
1341  std::runtime_error, "constantNumPackets must equal numVecs.");
1342 #endif // HAVE_TPETRA_DEBUG
1343 
1344  // mfh 12 Apr 2016: Decide where to unpack based on the memory
1345  // space in which the imports buffer was last modified.
1346  // DistObject::doTransferNew gets to decide this. We currently
1347  // require importLIDs to match (its most recent version must be in
1348  // the same memory space as imports' most recent version).
1349  const bool unpackOnHost =
1350  imports.modified_host () > imports.modified_device ();
1351  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1352  (unpackOnHost && importLIDs.modified_host () < importLIDs.modified_device (),
1353  std::logic_error, "The 'imports' buffer was last modified on host, "
1354  "but importLIDs was last modified on device." << suffix);
1355  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1356  (! unpackOnHost && importLIDs.modified_host () > importLIDs.modified_device (),
1357  std::logic_error, "The 'imports' buffer was last modified on device, "
1358  "but importLIDs was last modified on host." << suffix);
1359 
1360  // We have to sync before modifying, because this method may read
1361  // as well as write (depending on the CombineMode). This matters
1362  // because copyAndPermute may have modified *this in the other
1363  // memory space.
1364  if (unpackOnHost) {
1365  this->template sync<host_memory_space> ();
1366  this->template modify<host_memory_space> ();
1367  }
1368  else { // unpack on device
1369  this->template sync<dev_memory_space> ();
1370  this->template modify<dev_memory_space> ();
1371  }
1372  auto X_d = this->template getLocalView<dev_memory_space> ();
1373  auto X_h = this->template getLocalView<host_memory_space> ();
1374  auto imports_d = imports.template view<dev_memory_space> ();
1375  auto imports_h = imports.template view<host_memory_space> ();
1376  auto importLIDs_d = importLIDs.template view<dev_memory_space> ();
1377  auto importLIDs_h = importLIDs.template view<host_memory_space> ();
1378 
1379  Kokkos::DualView<size_t*, device_type> whichVecs;
1380  if (! isConstantStride ()) {
1381  Kokkos::View<const size_t*, host_memory_space,
1382  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1383  numVecs);
1384  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1385  if (unpackOnHost) {
1386  whichVecs.template modify<host_memory_space> ();
1387  Kokkos::deep_copy (whichVecs.template view<host_memory_space> (), whichVecsIn);
1388  }
1389  else {
1390  whichVecs.template modify<dev_memory_space> ();
1391  Kokkos::deep_copy (whichVecs.template view<dev_memory_space> (), whichVecsIn);
1392  }
1393  }
1394  auto whichVecs_d = whichVecs.template view<dev_memory_space> ();
1395  auto whichVecs_h = whichVecs.template view<host_memory_space> ();
1396 
1397  /* The layout in the export for MultiVectors is as follows:
1398  imports = { all of the data from row exportLIDs.front() ;
1399  ....
1400  all of the data from row exportLIDs.back() }
1401  This doesn't have the best locality, but is necessary because
1402  the data for a Packet (all data associated with an LID) is
1403  required to be contiguous. */
1404 
1405  if (numVecs > 0 && importLIDs.dimension_0 () > 0) {
1406  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1407  // custom combine modes, start editing here. Also, if you trust
1408  // inlining, it would be nice to condense this code by using a
1409  // binary function object f in the pack functors.
1410  if (CM == INSERT || CM == REPLACE) {
1411  const auto op = KokkosRefactor::Details::InsertOp ();
1412 
1413  if (isConstantStride ()) {
1414  if (unpackOnHost) {
1415  unpack_array_multi_column (X_h, imports_h, importLIDs_h, op,
1416  numVecs, debugCheckIndices);
1417 
1418  }
1419  else { // unpack on device
1420  unpack_array_multi_column (X_d, imports_d, importLIDs_d, op,
1421  numVecs, debugCheckIndices);
1422  }
1423  }
1424  else { // not constant stride
1425  if (unpackOnHost) {
1426  unpack_array_multi_column_variable_stride (X_h, imports_h,
1427  importLIDs_h,
1428  whichVecs_h, op,
1429  numVecs,
1430  debugCheckIndices);
1431  }
1432  else { // unpack on device
1433  unpack_array_multi_column_variable_stride (X_d, imports_d,
1434  importLIDs_d,
1435  whichVecs_d, op,
1436  numVecs,
1437  debugCheckIndices);
1438  }
1439  }
1440  }
1441  else if (CM == ADD) {
1442  const auto op = KokkosRefactor::Details::AddOp ();
1443 
1444  if (isConstantStride ()) {
1445  if (unpackOnHost) {
1446  unpack_array_multi_column (X_h, imports_h, importLIDs_h, op,
1447  numVecs, debugCheckIndices);
1448  }
1449  else { // unpack on device
1450  unpack_array_multi_column (X_d, imports_d, importLIDs_d, op,
1451  numVecs, debugCheckIndices);
1452  }
1453  }
1454  else { // not constant stride
1455  if (unpackOnHost) {
1456  unpack_array_multi_column_variable_stride (X_h, imports_h,
1457  importLIDs_h,
1458  whichVecs_h, op,
1459  numVecs,
1460  debugCheckIndices);
1461  }
1462  else { // unpack on device
1463  unpack_array_multi_column_variable_stride (X_d, imports_d,
1464  importLIDs_d,
1465  whichVecs_d, op,
1466  numVecs,
1467  debugCheckIndices);
1468  }
1469  }
1470  }
1471  else if (CM == ABSMAX) {
1472  const auto op = KokkosRefactor::Details::AbsMaxOp ();
1473 
1474  if (isConstantStride ()) {
1475  if (unpackOnHost) {
1476  unpack_array_multi_column (X_h, imports_h, importLIDs_h, op,
1477  numVecs, debugCheckIndices);
1478  }
1479  else { // unpack on device
1480  unpack_array_multi_column (X_d, imports_d, importLIDs_d, op,
1481  numVecs, debugCheckIndices);
1482  }
1483  }
1484  else {
1485  if (unpackOnHost) {
1486  unpack_array_multi_column_variable_stride (X_h, imports_h,
1487  importLIDs_h,
1488  whichVecs_h, op,
1489  numVecs,
1490  debugCheckIndices);
1491  }
1492  else { // unpack on device
1493  unpack_array_multi_column_variable_stride (X_d, imports_d,
1494  importLIDs_d,
1495  whichVecs_d, op,
1496  numVecs,
1497  debugCheckIndices);
1498  }
1499  }
1500  }
1501  else {
1502  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1503  (CM != ADD && CM != REPLACE && CM != INSERT && CM != ABSMAX,
1504  std::invalid_argument, "Invalid CombineMode: " << CM << ". Valid "
1505  "CombineMode values are ADD, REPLACE, INSERT, and ABSMAX.");
1506  }
1507  }
1508  }
1509 
1510  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1511  size_t
1514  {
1515  if (isConstantStride ()) {
1516  return static_cast<size_t> (view_.dimension_1 ());
1517  } else {
1518  return static_cast<size_t> (whichVectors_.size ());
1519  }
1520  }
1521 
1522  namespace { // (anonymous)
1523 
1524  template<class RV, class XMV>
1525  void
1526  lclDotImpl (const RV& dotsOut,
1527  const XMV& X_lcl,
1528  const XMV& Y_lcl,
1529  const size_t lclNumRows,
1530  const size_t numVecs,
1531  const Teuchos::ArrayView<const size_t>& whichVecsX,
1532  const Teuchos::ArrayView<const size_t>& whichVecsY,
1533  const bool constantStrideX,
1534  const bool constantStrideY)
1535  {
1536  using Kokkos::ALL;
1537  using Kokkos::subview;
1538  typedef typename RV::non_const_value_type dot_type;
1539 #ifdef HAVE_TPETRA_DEBUG
1540  const char prefix[] = "Tpetra::MultiVector::lclDotImpl: ";
1541 #endif // HAVE_TPETRA_DEBUG
1542 
1543  static_assert (Kokkos::Impl::is_view<RV>::value,
1544  "Tpetra::MultiVector::lclDotImpl: "
1545  "The first argument dotsOut is not a Kokkos::View.");
1546  static_assert (RV::rank == 1, "Tpetra::MultiVector::lclDotImpl: "
1547  "The first argument dotsOut must have rank 1.");
1548  static_assert (Kokkos::Impl::is_view<XMV>::value,
1549  "Tpetra::MultiVector::lclDotImpl: The type of the 2nd and "
1550  "3rd arguments (X_lcl and Y_lcl) is not a Kokkos::View.");
1551  static_assert (XMV::rank == 2, "Tpetra::MultiVector::lclDotImpl: "
1552  "X_lcl and Y_lcl must have rank 2.");
1553 
1554  // In case the input dimensions don't match, make sure that we
1555  // don't overwrite memory that doesn't belong to us, by using
1556  // subset views with the minimum dimensions over all input.
1557  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1558  const std::pair<size_t, size_t> colRng (0, numVecs);
1559  RV theDots = subview (dotsOut, colRng);
1560  XMV X = subview (X_lcl, rowRng, Kokkos::ALL());
1561  XMV Y = subview (Y_lcl, rowRng, Kokkos::ALL());
1562 
1563 #ifdef HAVE_TPETRA_DEBUG
1564  if (lclNumRows != 0) {
1565  TEUCHOS_TEST_FOR_EXCEPTION
1566  (X.dimension_0 () != lclNumRows, std::logic_error, prefix <<
1567  "X.dimension_0() = " << X.dimension_0 () << " != lclNumRows "
1568  "= " << lclNumRows << ". "
1569  "Please report this bug to the Tpetra developers.");
1570  TEUCHOS_TEST_FOR_EXCEPTION
1571  (Y.dimension_0 () != lclNumRows, std::logic_error, prefix <<
1572  "Y.dimension_0() = " << Y.dimension_0 () << " != lclNumRows "
1573  "= " << lclNumRows << ". "
1574  "Please report this bug to the Tpetra developers.");
1575  // If a MultiVector is constant stride, then numVecs should
1576  // equal its View's number of columns. Otherwise, numVecs
1577  // should be less than its View's number of columns.
1578  TEUCHOS_TEST_FOR_EXCEPTION
1579  (constantStrideX &&
1580  (X.dimension_0 () != lclNumRows || X.dimension_1 () != numVecs),
1581  std::logic_error, prefix << "X is " << X.dimension_0 () << " x " <<
1582  X.dimension_1 () << " (constant stride), which differs from the "
1583  "local dimensions " << lclNumRows << " x " << numVecs << ". "
1584  "Please report this bug to the Tpetra developers.");
1585  TEUCHOS_TEST_FOR_EXCEPTION
1586  (! constantStrideX &&
1587  (X.dimension_0 () != lclNumRows || X.dimension_1 () < numVecs),
1588  std::logic_error, prefix << "X is " << X.dimension_0 () << " x " <<
1589  X.dimension_1 () << " (NOT constant stride), but the local "
1590  "dimensions are " << lclNumRows << " x " << numVecs << ". "
1591  "Please report this bug to the Tpetra developers.");
1592  TEUCHOS_TEST_FOR_EXCEPTION
1593  (constantStrideY &&
1594  (Y.dimension_0 () != lclNumRows || Y.dimension_1 () != numVecs),
1595  std::logic_error, prefix << "Y is " << Y.dimension_0 () << " x " <<
1596  Y.dimension_1 () << " (constant stride), which differs from the "
1597  "local dimensions " << lclNumRows << " x " << numVecs << ". "
1598  "Please report this bug to the Tpetra developers.");
1599  TEUCHOS_TEST_FOR_EXCEPTION
1600  (! constantStrideY &&
1601  (Y.dimension_0 () != lclNumRows || Y.dimension_1 () < numVecs),
1602  std::logic_error, prefix << "Y is " << Y.dimension_0 () << " x " <<
1603  Y.dimension_1 () << " (NOT constant stride), but the local "
1604  "dimensions are " << lclNumRows << " x " << numVecs << ". "
1605  "Please report this bug to the Tpetra developers.");
1606  }
1607 #endif // HAVE_TPETRA_DEBUG
1608 
1609  if (lclNumRows == 0) {
1610  const dot_type zero = Kokkos::Details::ArithTraits<dot_type>::zero ();
1611  Kokkos::deep_copy(theDots, zero);
1612  }
1613  else { // lclNumRows != 0
1614  if (constantStrideX && constantStrideY) {
1615  if(X.dimension_1() == 1) {
1616  typename RV::non_const_value_type result =
1617  KokkosBlas::dot (Kokkos::subview(X,Kokkos::ALL(),0),
1618  Kokkos::subview(Y,Kokkos::ALL(),0));
1619  Kokkos::deep_copy(theDots,result);
1620  } else
1621  KokkosBlas::dot (theDots, X, Y);
1622  }
1623  else { // not constant stride
1624  // NOTE (mfh 15 Jul 2014) This does a kernel launch for
1625  // every column. It might be better to have a kernel that
1626  // does the work all at once. On the other hand, we don't
1627  // prioritize performance of MultiVector views of
1628  // noncontiguous columns.
1629  for (size_t k = 0; k < numVecs; ++k) {
1630  const size_t X_col = constantStrideX ? k : whichVecsX[k];
1631  const size_t Y_col = constantStrideY ? k : whichVecsY[k];
1632  KokkosBlas::dot (subview (theDots, k), subview (X, ALL (), X_col),
1633  subview (Y, ALL (), Y_col));
1634  } // for each column
1635  } // constantStride
1636  } // lclNumRows != 0
1637  }
1638 
1639  template<class RV>
1640  void
1641  gblDotImpl (const RV& dotsOut,
1642  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1643  const bool distributed)
1644  {
1645  using Teuchos::REDUCE_MAX;
1646  using Teuchos::REDUCE_SUM;
1647  using Teuchos::reduceAll;
1648  typedef typename RV::non_const_value_type dot_type;
1649 
1650  const size_t numVecs = dotsOut.dimension_0 ();
1651 
1652  // If the MultiVector is distributed over multiple processes, do
1653  // the distributed (interprocess) part of the dot product. We
1654  // assume that the MPI implementation can read from and write to
1655  // device memory.
1656  //
1657  // replaceMap() may have removed some processes. Those
1658  // processes have a null Map. They must not participate in any
1659  // collective operations. We ask first whether the Map is null,
1660  // because isDistributed() defers that question to the Map. We
1661  // still compute and return local dots for processes not
1662  // participating in collective operations; those probably don't
1663  // make any sense, but it doesn't hurt to do them, since it's
1664  // illegal to call dot() on those processes anyway.
1665  if (distributed && ! comm.is_null ()) {
1666  // The calling process only participates in the collective if
1667  // both the Map and its Comm on that process are nonnull.
1668  //
1669  // MPI doesn't allow aliasing of arguments, so we have to make
1670  // a copy of the local sum.
1671  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
1672  Kokkos::deep_copy (lclDots, dotsOut);
1673  const dot_type* const lclSum = lclDots.ptr_on_device ();
1674  dot_type* const gblSum = dotsOut.ptr_on_device ();
1675  const int nv = static_cast<int> (numVecs);
1676  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1677  }
1678  }
1679  } // namespace (anonymous)
1680 
1681  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1682  void
1683  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
1685  const Kokkos::View<dot_type*, device_type>& dots) const
1686  {
1687  using Kokkos::create_mirror_view;
1688  using Kokkos::subview;
1689  using Teuchos::Comm;
1690  using Teuchos::null;
1691  using Teuchos::RCP;
1692  // View of all the dot product results.
1693  typedef Kokkos::View<dot_type*, device_type> RV;
1695  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
1696 
1697  const size_t numVecs = this->getNumVectors ();
1698  if (numVecs == 0) {
1699  return; // nothing to do
1700  }
1701  const size_t lclNumRows = this->getLocalLength ();
1702  const size_t numDots = static_cast<size_t> (dots.dimension_0 ());
1703 
1704 #ifdef HAVE_TPETRA_DEBUG
1705  {
1706  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
1707  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1708  ! compat, std::invalid_argument, "Tpetra::MultiVector::dot: *this is "
1709  "not compatible with the input MultiVector A. We only test for this "
1710  "in a debug build.");
1711  }
1712 #endif // HAVE_TPETRA_DEBUG
1713 
1714  // FIXME (mfh 11 Jul 2014) These exception tests may not
1715  // necessarily be thrown on all processes consistently. We should
1716  // instead pass along error state with the inner product. We
1717  // could do this by setting an extra slot to
1718  // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
1719  // final sum should be
1720  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1721  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1722  lclNumRows != A.getLocalLength (), std::runtime_error,
1723  "MultiVectors do not have the same local length. "
1724  "this->getLocalLength() = " << lclNumRows << " != "
1725  "A.getLocalLength() = " << A.getLocalLength () << ".");
1726  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1727  numVecs != A.getNumVectors (), std::runtime_error,
1728  "MultiVectors must have the same number of columns (vectors). "
1729  "this->getNumVectors() = " << numVecs << " != "
1730  "A.getNumVectors() = " << A.getNumVectors () << ".");
1731  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1732  numDots != numVecs, std::runtime_error,
1733  "The output array 'dots' must have the same number of entries as the "
1734  "number of columns (vectors) in *this and A. dots.dimension_0() = " <<
1735  numDots << " != this->getNumVectors() = " << numVecs << ".");
1736 
1737  const std::pair<size_t, size_t> colRng (0, numVecs);
1738  RV dotsOut = subview (dots, colRng);
1739  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
1740  this->getMap ()->getComm ();
1741 
1742  // If we need sync to device, then host has the most recent
1743  // version. A is a guest of this method, so we should sync it.
1744  // Thus, let A control where execution happens.
1745  const bool useHostVersion = A.template need_sync<device_type> ();
1746  if (useHostVersion) {
1747  // A was last modified on host, so run the local kernel there.
1748  // This means we need a host mirror of the array of norms too.
1749  typedef typename dual_view_type::t_host XMV;
1750  typedef typename XMV::memory_space cur_memory_space;
1751 
1752  // I consider it more polite to sync *this, then to sync A.
1753  // A is a "guest" of this method, and is passed in const.
1754  const_cast<MV*> (this)->template sync<cur_memory_space> ();
1755  auto thisView = this->template getLocalView<cur_memory_space> ();
1756  auto A_view = A.template getLocalView<cur_memory_space> ();
1757 
1758  lclDotImpl<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
1759  this->whichVectors_, A.whichVectors_,
1760  this->isConstantStride (), A.isConstantStride ());
1761  auto dotsOutHost = Kokkos::create_mirror_view (dotsOut);
1762  Kokkos::deep_copy (dotsOutHost, dotsOut);
1763  gblDotImpl (dotsOutHost, comm, this->isDistributed ());
1764  Kokkos::deep_copy (dotsOut, dotsOutHost);
1765  }
1766  else {
1767  // A was last modified on device, so run the local kernel there.
1768  typedef typename dual_view_type::t_dev XMV;
1769  typedef typename XMV::memory_space cur_memory_space;
1770 
1771  // I consider it more polite to sync *this, then to sync A.
1772  // A is a "guest" of this method, and is passed in const.
1773  //
1774  // Yes, "const" is a lie.
1775  const_cast<MV*> (this)->template sync<cur_memory_space> ();
1776  auto thisView = this->template getLocalView<cur_memory_space> ();
1777  auto A_view = A.template getLocalView<cur_memory_space> ();
1778 
1779  lclDotImpl<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
1780  this->whichVectors_, A.whichVectors_,
1781  this->isConstantStride (), A.isConstantStride ());
1782  gblDotImpl (dotsOut, comm, this->isDistributed ());
1783  }
1784  }
1785 
1786 
1787  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1788  void
1791  const Teuchos::ArrayView<dot_type>& dots) const
1792  {
1793  typedef Kokkos::View<dot_type*, device_type> dev_dots_view_type;
1794  typedef MakeUnmanagedView<dot_type, device_type> view_getter_type;
1795  typedef typename view_getter_type::view_type host_dots_view_type;
1796 
1797  const size_t numDots = static_cast<size_t> (dots.size ());
1798  host_dots_view_type dotsHostView (dots.getRawPtr (), numDots);
1799  dev_dots_view_type dotsDevView ("MV::dot tmp", numDots);
1800  this->dot (A, dotsDevView); // Do the computation on the device.
1801  Kokkos::deep_copy (dotsHostView, dotsDevView); // Bring back result to host
1802  }
1803 
1804 
1805  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1806  void
1808  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
1809  {
1810  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
1811  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
1812  typedef typename view_getter_type::view_type host_norms_view_type;
1813 
1814  const size_t numNorms = static_cast<size_t> (norms.size ());
1815  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
1816  dev_norms_view_type normsDevView ("MV::norm2 tmp", numNorms);
1817  this->norm2 (normsDevView); // Do the computation on the device.
1818  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
1819  }
1820 
1821 
1822  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1823  void
1825  norm2 (const Kokkos::View<mag_type*, device_type>& norms) const
1826  {
1827  this->normImpl (norms, NORM_TWO);
1828  }
1829 
1830 
1831  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1832  void TPETRA_DEPRECATED
1835  const Teuchos::ArrayView<mag_type> &norms) const
1836  {
1837  using Kokkos::ALL;
1838  using Kokkos::subview;
1839  using Teuchos::Comm;
1840  using Teuchos::null;
1841  using Teuchos::RCP;
1842  using Teuchos::reduceAll;
1843  using Teuchos::REDUCE_SUM;
1844  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
1845  typedef Kokkos::Details::ArithTraits<mag_type> ATM;
1846  typedef Kokkos::View<mag_type*, device_type> norms_view_type;
1848  const char tfecfFuncName[] = "normWeighted: ";
1849 
1850  const size_t numVecs = this->getNumVectors ();
1851  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1852  static_cast<size_t> (norms.size ()) != numVecs, std::runtime_error,
1853  "norms.size() = " << norms.size () << " != this->getNumVectors() = "
1854  << numVecs << ".");
1855 
1856  const bool OneW = (weights.getNumVectors () == 1);
1857  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1858  ! OneW && weights.getNumVectors () != numVecs, std::runtime_error,
1859  "The input MultiVector of weights must contain either one column, "
1860  "or must have the same number of columns as *this. "
1861  "weights.getNumVectors() = " << weights.getNumVectors ()
1862  << " and this->getNumVectors() = " << numVecs << ".");
1863 
1864 #ifdef HAVE_TPETRA_DEBUG
1865  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1866  ! this->getMap ()->isCompatible (*weights.getMap ()), std::runtime_error,
1867  "MultiVectors do not have compatible Maps:" << std::endl
1868  << "this->getMap(): " << std::endl << *this->getMap()
1869  << "weights.getMap(): " << std::endl << *weights.getMap() << std::endl);
1870 #else
1871  const size_t lclNumRows = this->getLocalLength ();
1872  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1873  lclNumRows != weights.getLocalLength (), std::runtime_error,
1874  "MultiVectors do not have the same local length.");
1875 #endif // HAVE_TPETRA_DEBUG
1876 
1877  norms_view_type lclNrms ("lclNrms", numVecs);
1878 
1879  // FIXME (mfh 18 May 2016) Yes, I know "const" is a lie.
1880  const_cast<MV*> (this)->template sync<device_type> ();
1881  const_cast<MV&> (weights).template sync<device_type> ();
1882 
1883  auto X_lcl = this->template getLocalView<device_type> ();
1884  auto W_lcl = weights.template getLocalView<device_type> ();
1885 
1886  if (isConstantStride () && ! OneW) {
1887  KokkosBlas::nrm2w_squared (lclNrms, X_lcl, W_lcl);
1888  }
1889  else {
1890  for (size_t j = 0; j < numVecs; ++j) {
1891  const size_t X_col = this->isConstantStride () ? j :
1892  this->whichVectors_[j];
1893  const size_t W_col = OneW ? static_cast<size_t> (0) :
1894  (weights.isConstantStride () ? j : weights.whichVectors_[j]);
1895  KokkosBlas::nrm2w_squared (subview (lclNrms, j),
1896  subview (X_lcl, ALL (), X_col),
1897  subview (W_lcl, ALL (), W_col));
1898  }
1899  }
1900 
1901  const mag_type OneOverN =
1902  ATM::one () / static_cast<mag_type> (this->getGlobalLength ());
1903  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
1904  Teuchos::null : this->getMap ()->getComm ();
1905 
1906  if (! comm.is_null () && this->isDistributed ()) {
1907  // Assume that MPI can access device memory.
1908  reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
1909  lclNrms.ptr_on_device (), norms.getRawPtr ());
1910  for (size_t k = 0; k < numVecs; ++k) {
1911  norms[k] = ATM::sqrt (norms[k] * OneOverN);
1912  }
1913  }
1914  else {
1915  typename norms_view_type::HostMirror lclNrms_h =
1916  Kokkos::create_mirror_view (lclNrms);
1917  Kokkos::deep_copy (lclNrms_h, lclNrms);
1918  for (size_t k = 0; k < numVecs; ++k) {
1919  norms[k] = ATM::sqrt (ATS::magnitude (lclNrms_h(k)) * OneOverN);
1920  }
1921  }
1922  }
1923 
1924 
1925  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1926  void
1928  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
1929  {
1930  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
1931  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
1932  typedef typename view_getter_type::view_type host_norms_view_type;
1933 
1934  const size_t numNorms = static_cast<size_t> (norms.size ());
1935  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
1936  dev_norms_view_type normsDevView ("MV::norm1 tmp", numNorms);
1937  this->norm1 (normsDevView); // Do the computation on the device.
1938  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
1939  }
1940 
1941 
1942  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1943  void
1945  norm1 (const Kokkos::View<mag_type*, device_type>& norms) const
1946  {
1947  this->normImpl (norms, NORM_ONE);
1948  }
1949 
1950 
1951  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1952  void
1954  normInf (const Teuchos::ArrayView<mag_type>& norms) const
1955  {
1956  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
1957  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
1958  typedef typename view_getter_type::view_type host_norms_view_type;
1959 
1960  const size_t numNorms = static_cast<size_t> (norms.size ());
1961  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
1962  dev_norms_view_type normsDevView ("MV::normInf tmp", numNorms);
1963  this->normInf (normsDevView); // Do the computation on the device.
1964  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
1965  }
1966 
1967 
1968  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1969  void
1971  normInf (const Kokkos::View<mag_type*, device_type>& norms) const
1972  {
1973  this->normImpl (norms, NORM_INF);
1974  }
1975 
1976  namespace { // (anonymous)
1977 
1980  IMPL_NORM_ONE, //<! Use the one-norm
1981  IMPL_NORM_TWO, //<! Use the two-norm
1982  IMPL_NORM_INF //<! Use the infinity-norm
1983  };
1984 
1985  template<class RV, class XMV>
1986  void
1987  lclNormImpl (const RV& normsOut,
1988  const XMV& X_lcl,
1989  const size_t lclNumRows,
1990  const size_t numVecs,
1991  const Teuchos::ArrayView<const size_t>& whichVecs,
1992  const bool constantStride,
1993  const EWhichNormImpl whichNorm)
1994  {
1995  using Kokkos::ALL;
1996  using Kokkos::subview;
1997  typedef typename RV::non_const_value_type mag_type;
1998 
1999  static_assert (Kokkos::Impl::is_view<RV>::value,
2000  "Tpetra::MultiVector::lclNormImpl: "
2001  "The first argument RV is not a Kokkos::View.");
2002  static_assert (RV::rank == 1, "Tpetra::MultiVector::lclNormImpl: "
2003  "The first argument normsOut must have rank 1.");
2004  static_assert (Kokkos::Impl::is_view<XMV>::value,
2005  "Tpetra::MultiVector::lclNormImpl: "
2006  "The second argument X_lcl is not a Kokkos::View.");
2007  static_assert (XMV::rank == 2, "Tpetra::MultiVector::lclNormImpl: "
2008  "The second argument X_lcl must have rank 2.");
2009 
2010  // In case the input dimensions don't match, make sure that we
2011  // don't overwrite memory that doesn't belong to us, by using
2012  // subset views with the minimum dimensions over all input.
2013  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2014  const std::pair<size_t, size_t> colRng (0, numVecs);
2015  RV theNorms = subview (normsOut, colRng);
2016  XMV X = subview (X_lcl, rowRng, Kokkos::ALL());
2017 
2018  // mfh 10 Mar 2015: Kokkos::(Dual)View subviews don't quite
2019  // behave how you think when they have zero rows. In that case,
2020  // it returns a 0 x 0 (Dual)View.
2021  TEUCHOS_TEST_FOR_EXCEPTION(
2022  lclNumRows != 0 && constantStride && ( \
2023  ( X.dimension_0 () != lclNumRows ) ||
2024  ( X.dimension_1 () != numVecs ) ),
2025  std::logic_error, "Constant Stride X's dimensions are " << X.dimension_0 () << " x "
2026  << X.dimension_1 () << ", which differ from the local dimensions "
2027  << lclNumRows << " x " << numVecs << ". Please report this bug to "
2028  "the Tpetra developers.");
2029 
2030  TEUCHOS_TEST_FOR_EXCEPTION(
2031  lclNumRows != 0 && !constantStride && ( \
2032  ( X.dimension_0 () != lclNumRows ) ||
2033  ( X.dimension_1 () < numVecs ) ),
2034  std::logic_error, "Strided X's dimensions are " << X.dimension_0 () << " x "
2035  << X.dimension_1 () << ", which are incompatible with the local dimensions "
2036  << lclNumRows << " x " << numVecs << ". Please report this bug to "
2037  "the Tpetra developers.");
2038 
2039  if (lclNumRows == 0) {
2040  const mag_type zeroMag = Kokkos::Details::ArithTraits<mag_type>::zero ();
2041  Kokkos::deep_copy(theNorms, zeroMag);
2042  }
2043  else { // lclNumRows != 0
2044  if (constantStride) {
2045  if (whichNorm == IMPL_NORM_INF) {
2046  KokkosBlas::nrmInf (theNorms, X);
2047  }
2048  else if (whichNorm == IMPL_NORM_ONE) {
2049  KokkosBlas::nrm1 (theNorms, X);
2050  }
2051  else if (whichNorm == IMPL_NORM_TWO) {
2052  KokkosBlas::nrm2_squared (theNorms, X);
2053  }
2054  else {
2055  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
2056  }
2057  }
2058  else { // not constant stride
2059  // NOTE (mfh 15 Jul 2014) This does a kernel launch for
2060  // every column. It might be better to have a kernel that
2061  // does the work all at once. On the other hand, we don't
2062  // prioritize performance of MultiVector views of
2063  // noncontiguous columns.
2064  for (size_t k = 0; k < numVecs; ++k) {
2065  const size_t X_col = constantStride ? k : whichVecs[k];
2066  if (whichNorm == IMPL_NORM_INF) {
2067  KokkosBlas::nrmInf (subview (theNorms, k),
2068  subview (X, ALL (), X_col));
2069  }
2070  else if (whichNorm == IMPL_NORM_ONE) {
2071  KokkosBlas::nrm1 (subview (theNorms, k),
2072  subview (X, ALL (), X_col));
2073  }
2074  else if (whichNorm == IMPL_NORM_TWO) {
2075  KokkosBlas::nrm2_squared (subview (theNorms, k),
2076  subview (X, ALL (), X_col));
2077  }
2078  else {
2079  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
2080  }
2081  } // for each column
2082  } // constantStride
2083  } // lclNumRows != 0
2084  }
2085 
2086  template<class RV>
2087  void
2088  gblNormImpl (const RV& normsOut,
2089  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2090  const bool distributed,
2091  const EWhichNormImpl whichNorm)
2092  {
2093  using Teuchos::REDUCE_MAX;
2094  using Teuchos::REDUCE_SUM;
2095  using Teuchos::reduceAll;
2096  typedef typename RV::non_const_value_type mag_type;
2097 
2098  const size_t numVecs = normsOut.dimension_0 ();
2099 
2100  // If the MultiVector is distributed over multiple processes, do
2101  // the distributed (interprocess) part of the norm. We assume
2102  // that the MPI implementation can read from and write to device
2103  // memory.
2104  //
2105  // replaceMap() may have removed some processes. Those processes
2106  // have a null Map. They must not participate in any collective
2107  // operations. We ask first whether the Map is null, because
2108  // isDistributed() defers that question to the Map. We still
2109  // compute and return local norms for processes not participating
2110  // in collective operations; those probably don't make any sense,
2111  // but it doesn't hurt to do them, since it's illegal to call
2112  // norm*() on those processes anyway.
2113  if (distributed && ! comm.is_null ()) {
2114  // The calling process only participates in the collective if
2115  // both the Map and its Comm on that process are nonnull.
2116  //
2117  // MPI doesn't allow aliasing of arguments, so we have to make
2118  // a copy of the local sum.
2119  RV lclNorms ("MV::normImpl lcl", numVecs);
2120  Kokkos::deep_copy (lclNorms, normsOut);
2121  const mag_type* const lclSum = lclNorms.ptr_on_device ();
2122  mag_type* const gblSum = normsOut.ptr_on_device ();
2123  const int nv = static_cast<int> (numVecs);
2124  if (whichNorm == IMPL_NORM_INF) {
2125  reduceAll<int, mag_type> (*comm, REDUCE_MAX, nv, lclSum, gblSum);
2126  } else {
2127  reduceAll<int, mag_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2128  }
2129  }
2130 
2131  if (whichNorm == IMPL_NORM_TWO) {
2132  // Replace the norm-squared results with their square roots in
2133  // place, to get the final output. If the device memory and
2134  // the host memory are the same, it probably doesn't pay to
2135  // launch a parallel kernel for that, since there isn't enough
2136  // parallelism for the typical MultiVector case.
2137  const bool inHostMemory =
2138  Kokkos::Impl::is_same<typename RV::memory_space,
2139  typename RV::host_mirror_space::memory_space>::value;
2140  if (inHostMemory) {
2141  for (size_t j = 0; j < numVecs; ++j) {
2142  normsOut(j) = Kokkos::Details::ArithTraits<mag_type>::sqrt (normsOut(j));
2143  }
2144  }
2145  else {
2146  // There's not as much parallelism now, but that's OK. The
2147  // point of doing parallel dispatch here is to keep the norm
2148  // results on the device, thus avoiding a copy to the host and
2149  // back again.
2150  KokkosBlas::Impl::SquareRootFunctor<RV> f (normsOut);
2151  Kokkos::parallel_for (numVecs, f);
2152  }
2153  }
2154  }
2155 
2156  } // namespace (anonymous)
2157 
2158  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2159  void
2160  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
2161  normImpl (const Kokkos::View<mag_type*, device_type>& norms,
2162  const EWhichNorm whichNorm) const
2163  {
2164  using Kokkos::create_mirror_view;
2165  using Kokkos::subview;
2166  using Teuchos::Comm;
2167  using Teuchos::null;
2168  using Teuchos::RCP;
2169  // View of all the norm results.
2170  typedef Kokkos::View<mag_type*, device_type> RV;
2171 
2172  const size_t numVecs = this->getNumVectors ();
2173  if (numVecs == 0) {
2174  return; // nothing to do
2175  }
2176  const size_t lclNumRows = this->getLocalLength ();
2177  const size_t numNorms = static_cast<size_t> (norms.dimension_0 ());
2178  TEUCHOS_TEST_FOR_EXCEPTION(
2179  numNorms < numVecs, std::runtime_error, "Tpetra::MultiVector::normImpl: "
2180  "'norms' must have at least as many entries as the number of vectors in "
2181  "*this. norms.dimension_0() = " << numNorms << " < this->getNumVectors()"
2182  " = " << numVecs << ".");
2183 
2184  const std::pair<size_t, size_t> colRng (0, numVecs);
2185  RV normsOut = subview (norms, colRng);
2186 
2187  EWhichNormImpl lclNormType;
2188  if (whichNorm == NORM_ONE) {
2189  lclNormType = IMPL_NORM_ONE;
2190  } else if (whichNorm == NORM_TWO) {
2191  lclNormType = IMPL_NORM_TWO;
2192  } else {
2193  lclNormType = IMPL_NORM_INF;
2194  }
2195 
2196  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2197  this->getMap ()->getComm ();
2198 
2199  // If we need sync to device, then host has the most recent version.
2200  const bool useHostVersion = this->template need_sync<device_type> ();
2201  if (useHostVersion) {
2202  // DualView was last modified on host, so run the local kernel there.
2203  // This means we need a host mirror of the array of norms too.
2204  typedef typename dual_view_type::t_host XMV;
2205  typedef typename XMV::memory_space cur_memory_space;
2206 
2207  auto thisView = this->template getLocalView<cur_memory_space> ();
2208  lclNormImpl<RV, XMV> (normsOut, thisView, lclNumRows, numVecs,
2209  this->whichVectors_, this->isConstantStride (),
2210  lclNormType);
2211  auto normsOutHost = Kokkos::create_mirror_view (normsOut);
2212  Kokkos::deep_copy (normsOutHost, normsOut);
2213  gblNormImpl (normsOutHost, comm, this->isDistributed (), lclNormType);
2214  Kokkos::deep_copy (normsOut, normsOutHost);
2215  }
2216  else {
2217  // DualView was last modified on device, so run the local kernel there.
2218  typedef typename dual_view_type::t_dev XMV;
2219  typedef typename XMV::memory_space cur_memory_space;
2220 
2221  auto thisView = this->template getLocalView<cur_memory_space> ();
2222  lclNormImpl<RV, XMV> (normsOut, thisView, lclNumRows, numVecs,
2223  this->whichVectors_, this->isConstantStride (),
2224  lclNormType);
2225  gblNormImpl (normsOut, comm, this->isDistributed (), lclNormType);
2226  }
2227  }
2228 
2229  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2230  void
2232  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2233  {
2234  // KR FIXME Overload this method to take a View.
2235  using Kokkos::ALL;
2236  using Kokkos::subview;
2237  using Teuchos::Comm;
2238  using Teuchos::RCP;
2239  using Teuchos::reduceAll;
2240  using Teuchos::REDUCE_SUM;
2241  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2242 
2243  const size_t lclNumRows = this->getLocalLength ();
2244  const size_t numVecs = this->getNumVectors ();
2245  const size_t numMeans = static_cast<size_t> (means.size ());
2246 
2247  TEUCHOS_TEST_FOR_EXCEPTION(
2248  numMeans != numVecs, std::runtime_error,
2249  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2250  << " != this->getNumVectors() = " << numVecs << ".");
2251 
2252  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2253  const std::pair<size_t, size_t> colRng (0, numVecs);
2254 
2255  // Make sure that the final output view has the same layout as the
2256  // temporary view's HostMirror. Left or Right doesn't matter for
2257  // a 1-D array anyway; this is just to placate the compiler.
2258  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2259  typedef Kokkos::View<impl_scalar_type*,
2260  typename local_view_type::HostMirror::array_layout,
2261  Kokkos::HostSpace,
2262  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2263  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2264 
2265  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2266  this->getMap ()->getComm ();
2267 
2268  // If we need sync to device, then host has the most recent version.
2269  const bool useHostVersion = this->template need_sync<device_type> ();
2270  if (useHostVersion) {
2271  // DualView was last modified on host, so run the local kernel there.
2272  auto X_lcl = subview (this->template getLocalView<Kokkos::HostSpace> (),
2273  rowRng, Kokkos::ALL ());
2274  // Compute the local sum of each column.
2275  typename local_view_type::HostMirror lclSums ("MV::meanValue tmp", numVecs);
2276  if (isConstantStride ()) {
2277  KokkosBlas::sum (lclSums, X_lcl);
2278  }
2279  else {
2280  for (size_t j = 0; j < numVecs; ++j) {
2281  const size_t col = whichVectors_[j];
2282  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2283  }
2284  }
2285 
2286  // If there are multiple MPI processes, the all-reduce reads
2287  // from lclSums, and writes to meansOut. Otherwise, we just
2288  // copy lclSums into meansOut.
2289  if (! comm.is_null () && this->isDistributed ()) {
2290  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2291  lclSums.ptr_on_device (), meansOut.ptr_on_device ());
2292  }
2293  else {
2294  Kokkos::deep_copy (meansOut, lclSums);
2295  }
2296  }
2297  else {
2298  // DualView was last modified on device, so run the local kernel there.
2299  auto X_lcl = subview (this->template getLocalView<device_type> (),
2300  rowRng, Kokkos::ALL ());
2301 
2302  // Compute the local sum of each column.
2303  local_view_type lclSums ("MV::meanValue tmp", numVecs);
2304  if (isConstantStride ()) {
2305  KokkosBlas::sum (lclSums, X_lcl);
2306  }
2307  else {
2308  for (size_t j = 0; j < numVecs; ++j) {
2309  const size_t col = whichVectors_[j];
2310  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2311  }
2312  }
2313 
2314  // If there are multiple MPI processes, the all-reduce reads
2315  // from lclSums, and writes to meansOut. (We assume that MPI
2316  // can read device memory.) Otherwise, we just copy lclSums
2317  // into meansOut.
2318  if (! comm.is_null () && this->isDistributed ()) {
2319  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2320  lclSums.ptr_on_device (), meansOut.ptr_on_device ());
2321  }
2322  else {
2323  Kokkos::deep_copy (meansOut, lclSums);
2324  }
2325  }
2326 
2327  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2328  // to the magnitude type, since operator/ (std::complex<T>, int)
2329  // isn't necessarily defined.
2330  const impl_scalar_type OneOverN =
2331  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2332  for (size_t k = 0; k < numMeans; ++k) {
2333  meansOut(k) = meansOut(k) * OneOverN;
2334  }
2335  }
2336 
2337 
2338  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2339  void
2342  {
2343  typedef impl_scalar_type IST;
2344  typedef Kokkos::Details::ArithTraits<IST> ATS;
2345  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2346  typedef typename pool_type::generator_type generator_type;
2347 
2348  const IST max = Kokkos::rand<generator_type, IST>::max ();
2349  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2350 
2351  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2352  }
2353 
2354 
2355  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2356  void
2358  randomize (const Scalar& minVal, const Scalar& maxVal)
2359  {
2360  typedef impl_scalar_type IST;
2361  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2362 
2363  // Seed the pseudorandom number generator using the calling
2364  // process' rank. This helps decorrelate different process'
2365  // pseudorandom streams. It's not perfect but it's effective and
2366  // doesn't require MPI communication. The seed also includes bits
2367  // from the standard library's rand().
2368  //
2369  // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2370  // The code below just makes a new seed each time.
2371 
2372  const uint64_t myRank =
2373  static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2374  uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2375  unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2376 
2377  pool_type rand_pool (seed);
2378  const IST max = static_cast<IST> (maxVal);
2379  const IST min = static_cast<IST> (minVal);
2380 
2381  this->template modify<device_type> ();
2382  auto thisView = this->getLocalView<device_type> ();
2383 
2384  if (isConstantStride ()) {
2385  Kokkos::fill_random (thisView, rand_pool, min, max);
2386  }
2387  else {
2388  const size_t numVecs = getNumVectors ();
2389  for (size_t k = 0; k < numVecs; ++k) {
2390  const size_t col = whichVectors_[k];
2391  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2392  Kokkos::fill_random (X_k, rand_pool, min, max);
2393  }
2394  }
2395  }
2396 
2397 
2398  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2399  void
2401  putScalar (const Scalar& alpha)
2402  {
2403  using Kokkos::ALL;
2404  using Kokkos::deep_copy;
2405  using Kokkos::subview;
2406  typedef typename device_type::memory_space DMS;
2407  typedef Kokkos::HostSpace HMS;
2408 
2409  // We need this cast for cases like Scalar = std::complex<T> but
2410  // impl_scalar_type = Kokkos::complex<T>.
2411  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2412  const size_t lclNumRows = this->getLocalLength ();
2413  const size_t numVecs = this->getNumVectors ();
2414  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2415  const std::pair<size_t, size_t> colRng (0, numVecs);
2416 
2417  // Modify the most recently updated version of the data. This
2418  // avoids sync'ing, which could violate users' expectations.
2419  //
2420  // If we need sync to device, then host has the most recent version.
2421  const bool useHostVersion = this->template need_sync<device_type> ();
2422 
2423  if (! useHostVersion) { // last modified in device memory
2424  this->template modify<DMS> (); // we are about to modify on the device
2425  auto X = subview (this->template getLocalView<DMS> (), rowRng, ALL ());
2426  if (numVecs == 1) {
2427  auto X_0 = subview (X, ALL (), static_cast<size_t> (0));
2428  deep_copy (X_0, theAlpha);
2429  }
2430  else if (isConstantStride ()) {
2431  deep_copy (X, theAlpha);
2432  }
2433  else {
2434  for (size_t k = 0; k < numVecs; ++k) {
2435  const size_t col = whichVectors_[k];
2436  auto X_k = subview (X, ALL (), col);
2437  deep_copy (X_k, theAlpha);
2438  }
2439  }
2440  }
2441  else { // last modified in host memory, so modify data there.
2442  this->template modify<HMS> (); // we are about to modify on the host
2443  auto X = subview (this->template getLocalView<HMS> (), rowRng, ALL ());
2444  if (numVecs == 1) {
2445  auto X_0 = subview (X, ALL (), static_cast<size_t> (0));
2446  deep_copy (X_0, theAlpha);
2447  }
2448  else if (isConstantStride ()) {
2449  deep_copy (X, theAlpha);
2450  }
2451  else {
2452  for (size_t k = 0; k < numVecs; ++k) {
2453  const size_t col = whichVectors_[k];
2454  auto X_k = subview (X, ALL (), col);
2455  deep_copy (X_k, theAlpha);
2456  }
2457  }
2458  }
2459  }
2460 
2461 
2462  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2463  void
2465  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2466  {
2467  using Teuchos::ArrayRCP;
2468  using Teuchos::Comm;
2469  using Teuchos::RCP;
2470 
2471  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2472  // it might work if the MV is a column view of another MV.
2473  // However, things might go wrong when restoring the original
2474  // Map, so we don't allow this case for now.
2475  TEUCHOS_TEST_FOR_EXCEPTION(
2476  ! this->isConstantStride (), std::logic_error,
2477  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2478  "if the MultiVector is a column view of another MultiVector (that is, if "
2479  "isConstantStride() == false).");
2480 
2481  // Case 1: current Map and new Map are both nonnull on this process.
2482  // Case 2: current Map is nonnull, new Map is null.
2483  // Case 3: current Map is null, new Map is nonnull.
2484  // Case 4: both Maps are null: forbidden.
2485  //
2486  // Case 1 means that we don't have to do anything on this process,
2487  // other than assign the new Map. (We always have to do that.)
2488  // It's an error for the user to supply a Map that requires
2489  // resizing in this case.
2490  //
2491  // Case 2 means that the calling process is in the current Map's
2492  // communicator, but will be excluded from the new Map's
2493  // communicator. We don't have to do anything on the calling
2494  // process; just leave whatever data it may have alone.
2495  //
2496  // Case 3 means that the calling process is excluded from the
2497  // current Map's communicator, but will be included in the new
2498  // Map's communicator. This means we need to (re)allocate the
2499  // local DualView if it does not have the right number of rows.
2500  // If the new number of rows is nonzero, we'll fill the newly
2501  // allocated local data with zeros, as befits a projection
2502  // operation.
2503  //
2504  // The typical use case for Case 3 is that the MultiVector was
2505  // first created with the Map with more processes, then that Map
2506  // was replaced with a Map with fewer processes, and finally the
2507  // original Map was restored on this call to replaceMap.
2508 
2509 #ifdef HAVE_TEUCHOS_DEBUG
2510  // mfh 28 Mar 2013: We can't check for compatibility across the
2511  // whole communicator, unless we know that the current and new
2512  // Maps are nonnull on _all_ participating processes.
2513  // TEUCHOS_TEST_FOR_EXCEPTION(
2514  // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
2515  // std::invalid_argument, "Tpetra::MultiVector::project: "
2516  // "If the input Map's communicator is compatible (has the same number of "
2517  // "processes as) the current Map's communicator, then the two Maps must be "
2518  // "compatible. The replaceMap() method is not for data redistribution; "
2519  // "use Import or Export for that purpose.");
2520 
2521  // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
2522  // of the Map, in case the process counts don't match.
2523 #endif // HAVE_TEUCHOS_DEBUG
2524 
2525  if (this->getMap ().is_null ()) { // current Map is null
2526  // If this->getMap() is null, that means that this MultiVector
2527  // has already had replaceMap happen to it. In that case, just
2528  // reallocate the DualView with the right size.
2529 
2530  TEUCHOS_TEST_FOR_EXCEPTION(
2531  newMap.is_null (), std::invalid_argument,
2532  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2533  "This probably means that the input Map is incorrect.");
2534 
2535  // Case 3: current Map is null, new Map is nonnull.
2536  // Reallocate the DualView with the right dimensions.
2537  const size_t newNumRows = newMap->getNodeNumElements ();
2538  const size_t origNumRows = view_.dimension_0 ();
2539  const size_t numCols = this->getNumVectors ();
2540 
2541  if (origNumRows != newNumRows || view_.dimension_1 () != numCols) {
2542  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (newNumRows, numCols);
2543  }
2544  }
2545  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2546  // I am an excluded process. Reinitialize my data so that I
2547  // have 0 rows. Keep the number of columns as before.
2548  const size_t newNumRows = static_cast<size_t> (0);
2549  const size_t numCols = this->getNumVectors ();
2550  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (newNumRows, numCols);
2551  }
2552 
2553  this->map_ = newMap;
2554  }
2555 
2556  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2557  void
2559  scale (const Scalar& alpha)
2560  {
2561  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2562  if (theAlpha == Kokkos::Details::ArithTraits<impl_scalar_type>::one ()) {
2563  return; // do nothing
2564  }
2565  const size_t lclNumRows = this->getLocalLength ();
2566  const size_t numVecs = this->getNumVectors ();
2567  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2568  const std::pair<size_t, size_t> colRng (0, numVecs);
2569 
2570  // We can't substitute putScalar(0.0) for scale(0.0), because the
2571  // former will overwrite NaNs present in the MultiVector. The
2572  // semantics of this call require multiplying them by 0, which
2573  // IEEE 754 requires to be NaN.
2574 
2575  // If we need sync to device, then host has the most recent version.
2576  const bool useHostVersion = this->template need_sync<device_type> ();
2577  if (useHostVersion) {
2578  auto Y_lcl =
2579  Kokkos::subview (this->template getLocalView<Kokkos::HostSpace> (),
2580  rowRng, Kokkos::ALL ());
2581  if (isConstantStride ()) {
2582  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2583  }
2584  else {
2585  for (size_t k = 0; k < numVecs; ++k) {
2586  const size_t Y_col = this->isConstantStride () ? k :
2587  this->whichVectors_[k];
2588  auto Y_k = Kokkos::subview (Y_lcl, Kokkos::ALL (), Y_col);
2589  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2590  }
2591  }
2592  }
2593  else { // work on device
2594  auto Y_lcl =
2595  Kokkos::subview (this->template getLocalView<device_type> (),
2596  rowRng, Kokkos::ALL ());
2597  if (isConstantStride ()) {
2598  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2599  }
2600  else {
2601  for (size_t k = 0; k < numVecs; ++k) {
2602  const size_t Y_col = this->isConstantStride () ? k :
2603  this->whichVectors_[k];
2604  auto Y_k = Kokkos::subview (Y_lcl, Kokkos::ALL (), Y_col);
2605  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2606  }
2607  }
2608  }
2609  }
2610 
2611 
2612  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2613  void
2615  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2616  {
2617  const size_t numVecs = this->getNumVectors ();
2618  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2619  TEUCHOS_TEST_FOR_EXCEPTION(
2620  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2621  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2622  << numVecs << ".");
2623 
2624  // Use a DualView to copy the scaling constants onto the device.
2625  typedef Kokkos::DualView<impl_scalar_type*, device_type> k_alphas_type ;
2626  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2627  k_alphas.template modify<typename k_alphas_type::host_mirror_space> ();
2628  for (size_t i = 0; i < numAlphas; ++i) {
2629  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2630  }
2631  k_alphas.template sync<typename k_alphas_type::memory_space> ();
2632  // Invoke the scale() overload that takes a device View of coefficients.
2633  this->scale (k_alphas.d_view);
2634  }
2635 
2636  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2637  void
2639  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2640  {
2641  using Kokkos::ALL;
2642  using Kokkos::subview;
2643 
2644  const size_t lclNumRows = this->getLocalLength ();
2645  const size_t numVecs = this->getNumVectors ();
2646  TEUCHOS_TEST_FOR_EXCEPTION(
2647  static_cast<size_t> (alphas.dimension_0 ()) != numVecs,
2648  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2649  "alphas.dimension_0() = " << alphas.dimension_0 ()
2650  << " != this->getNumVectors () = " << numVecs << ".");
2651  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2652  const std::pair<size_t, size_t> colRng (0, numVecs);
2653 
2654  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2655  // type of the return value of subview. This is because if we
2656  // switch the array layout from LayoutLeft to LayoutRight
2657  // (preferred for performance of block operations), the types
2658  // below won't be valid. (A view of a column of a LayoutRight
2659  // multivector has LayoutStride, not LayoutLeft.)
2660 
2661  // If we need sync to device, then host has the most recent version.
2662  const bool useHostVersion = this->template need_sync<device_type> ();
2663  if (useHostVersion) {
2664  // Work in host memory. This means we need to create a host
2665  // mirror of the input View of coefficients.
2666  auto alphas_h = Kokkos::create_mirror_view (alphas);
2667  typedef typename decltype (alphas_h)::memory_space cur_memory_space;
2668  Kokkos::deep_copy (alphas_h, alphas);
2669 
2670  auto Y_lcl =
2671  Kokkos::subview (this->template getLocalView<cur_memory_space> (),
2672  rowRng, Kokkos::ALL ());
2673  if (isConstantStride ()) {
2674  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2675  }
2676  else {
2677  for (size_t k = 0; k < numVecs; ++k) {
2678  const size_t Y_col = this->isConstantStride () ? k :
2679  this->whichVectors_[k];
2680  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2681  // We don't have to use the entire 1-D View here; we can use
2682  // the version that takes a scalar coefficient.
2683  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2684  }
2685  }
2686  }
2687  else { // Work in device memory, using the input View 'alphas' directly.
2688  auto Y_lcl =
2689  Kokkos::subview (this->template getLocalView<device_type> (),
2690  rowRng, Kokkos::ALL());
2691  if (isConstantStride ()) {
2692  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2693  }
2694  else {
2695  for (size_t k = 0; k < numVecs; ++k) {
2696  const size_t Y_col = this->isConstantStride () ? k :
2697  this->whichVectors_[k];
2698  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2699  //
2700  // FIXME (mfh 08 Apr 2015) This assumes UVM. It would be
2701  // better to fix scal() so that it takes a 0-D View as the
2702  // second argument.
2703  //
2704  KokkosBlas::scal (Y_k, alphas(k), Y_k);
2705  }
2706  }
2707  }
2708  }
2709 
2710  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2711  void
2713  scale (const Scalar& alpha,
2715  {
2716  using Kokkos::ALL;
2717  using Kokkos::subview;
2718  const char tfecfFuncName[] = "scale: ";
2719 
2720  const size_t lclNumRows = getLocalLength ();
2721  const size_t numVecs = getNumVectors ();
2722 
2723  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2724  lclNumRows != A.getLocalLength (), std::invalid_argument,
2725  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2726  << A.getLocalLength () << ".");
2727  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2728  numVecs != A.getNumVectors (), std::invalid_argument,
2729  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2730  << A.getNumVectors () << ".");
2731 
2732  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2733  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2734  const std::pair<size_t, size_t> colRng (0, numVecs);
2735 
2736  typedef typename dual_view_type::t_dev dev_view_type;
2737  typedef typename dual_view_type::t_host host_view_type;
2738 
2739  // If we need sync to device, then host has the most recent version.
2740  const bool useHostVersion = this->template need_sync<device_type> ();
2741  if (useHostVersion) {
2742  // Work on host, where A's data were most recently modified. A
2743  // is a "guest" of this method, so it's more polite to sync
2744  // *this, than to sync A.
2745  typedef typename host_view_type::memory_space cur_memory_space;
2746 
2747  this->template sync<cur_memory_space> ();
2748  this->template modify<cur_memory_space> ();
2749  auto Y_lcl_orig = this->template getLocalView<cur_memory_space> ();
2750  auto X_lcl_orig = A.template getLocalView<cur_memory_space> ();
2751  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2752  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2753 
2754  if (isConstantStride () && A.isConstantStride ()) {
2755  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2756  }
2757  else {
2758  // Make sure that Kokkos only uses the local length for add.
2759  for (size_t k = 0; k < numVecs; ++k) {
2760  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2761  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2762  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2763  auto X_k = subview (X_lcl, ALL (), X_col);
2764 
2765  KokkosBlas::scal (Y_k, theAlpha, X_k);
2766  }
2767  }
2768  }
2769  else { // work on device
2770  typedef typename dev_view_type::memory_space cur_memory_space;
2771 
2772  this->template sync<cur_memory_space> ();
2773  this->template modify<cur_memory_space> ();
2774  auto Y_lcl_orig = this->template getLocalView<cur_memory_space> ();
2775  auto X_lcl_orig = A.template getLocalView<cur_memory_space> ();
2776  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2777  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2778 
2779  if (isConstantStride () && A.isConstantStride ()) {
2780  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2781  }
2782  else {
2783  // Make sure that Kokkos only uses the local length for add.
2784  for (size_t k = 0; k < numVecs; ++k) {
2785  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2786  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2787  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2788  auto X_k = subview (X_lcl, ALL (), X_col);
2789 
2790  KokkosBlas::scal (Y_k, theAlpha, X_k);
2791  }
2792  }
2793  }
2794  }
2795 
2796 
2797  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2798  void
2801  {
2803  const char tfecfFuncName[] = "reciprocal: ";
2804 
2805  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2806  getLocalLength () != A.getLocalLength (), std::runtime_error,
2807  "MultiVectors do not have the same local length. "
2808  "this->getLocalLength() = " << getLocalLength ()
2809  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2810  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2811  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2812  ": MultiVectors do not have the same number of columns (vectors). "
2813  "this->getNumVectors() = " << getNumVectors ()
2814  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2815 
2816  // FIXME (mfh 07 Jan 2015) See note on two-argument scale() above.
2817 
2818  const size_t numVecs = getNumVectors ();
2819  try {
2820  this->template sync<device_type> ();
2821  this->template modify<device_type> ();
2822  // FIXME (mfh 23 Jul 2014) I'm not sure if it should be our
2823  // responsibility to sync A.
2824  //
2825  // FIXME (mfh 18 May 2016) It's rude to sync the input
2826  // argument, since it is marked const.
2827  const_cast<MV&> (A).template sync<device_type> ();
2828 
2829  auto this_view_dev = this->template getLocalView<device_type> ();
2830  auto A_view_dev = A.template getLocalView<device_type> ();
2831 
2832  if (isConstantStride () && A.isConstantStride ()) {
2833  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
2834  }
2835  else {
2836  using Kokkos::ALL;
2837  using Kokkos::subview;
2838  typedef Kokkos::View<impl_scalar_type*, device_type> view_type;
2839 
2840  for (size_t k = 0; k < numVecs; ++k) {
2841  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2842  view_type vector_k = subview (this_view_dev, ALL (), this_col);
2843  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2844  view_type vector_Ak = subview (A_view_dev, ALL (), A_col);
2845  KokkosBlas::reciprocal (vector_k, vector_Ak);
2846  }
2847  }
2848  }
2849  catch (std::runtime_error &e) {
2850  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
2851  ": Caught exception from Kokkos: " << e.what () << std::endl);
2852  }
2853  }
2854 
2855 
2856  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2857  void
2860  {
2862  const char tfecfFuncName[] = "abs";
2863  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2864  getLocalLength () != A.getLocalLength (), std::runtime_error,
2865  ": MultiVectors do not have the same local length. "
2866  "this->getLocalLength() = " << getLocalLength ()
2867  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2868  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2869  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2870  ": MultiVectors do not have the same number of columns (vectors). "
2871  "this->getNumVectors() = " << getNumVectors ()
2872  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2873 
2874  // FIXME (mfh 07 Jan 2015) See note on two-argument scale() above.
2875 
2876  const size_t numVecs = getNumVectors ();
2877 
2878  this->template sync<device_type> ();
2879  this->template modify<device_type> ();
2880  // FIXME (mfh 23 Jul 2014) I'm not sure if it should be our
2881  // responsibility to sync A.
2882  //
2883  // FIXME (mfh 18 May 2016) It's rude to sync the input
2884  // argument, since it is marked const.
2885  const_cast<MV&> (A).template sync<device_type> ();
2886 
2887  auto this_view_dev = this->template getLocalView<device_type> ();
2888  auto A_view_dev = A.template getLocalView<device_type> ();
2889 
2890  if (isConstantStride () && A.isConstantStride ()) {
2891  KokkosBlas::abs (this_view_dev, A_view_dev);
2892  }
2893  else {
2894  using Kokkos::ALL;
2895  using Kokkos::subview;
2896  typedef Kokkos::View<impl_scalar_type*, device_type> view_type;
2897 
2898  for (size_t k=0; k < numVecs; ++k) {
2899  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2900  view_type vector_k = subview (this_view_dev, ALL (), this_col);
2901  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2902  view_type vector_Ak = subview (A_view_dev, ALL (), A_col);
2903  KokkosBlas::abs (vector_k, vector_Ak);
2904  }
2905  }
2906  }
2907 
2908 
2909  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2910  void
2912  update (const Scalar& alpha,
2914  const Scalar& beta)
2915  {
2916  using Kokkos::ALL;
2917  using Kokkos::subview;
2918  const char tfecfFuncName[] = "update: ";
2919 
2920  const size_t lclNumRows = getLocalLength ();
2921  const size_t numVecs = getNumVectors ();
2922 
2923  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2924  lclNumRows != A.getLocalLength (), std::invalid_argument,
2925  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2926  << A.getLocalLength () << ".");
2927  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2928  numVecs != A.getNumVectors (), std::invalid_argument,
2929  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2930  << A.getNumVectors () << ".");
2931 
2932  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2933  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2934  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2935  const std::pair<size_t, size_t> colRng (0, numVecs);
2936 
2937  typedef typename dual_view_type::t_dev dev_view_type;
2938  typedef typename dual_view_type::t_host host_view_type;
2939 
2940  // If we need sync to device, then host has the most recent version.
2941  const bool useHostVersion = this->template need_sync<device_type> ();
2942  if (useHostVersion) {
2943  // Work on host, where A's data were most recently modified. A
2944  // is a "guest" of this method, so it's more polite to sync
2945  // *this, than to sync A.
2946  typedef typename host_view_type::memory_space cur_memory_space;
2947 
2948  this->template sync<cur_memory_space> ();
2949  this->template modify<cur_memory_space> ();
2950  auto Y_lcl_orig = this->template getLocalView<cur_memory_space> ();
2951  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2952  auto X_lcl_orig = A.template getLocalView<cur_memory_space> ();
2953  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2954 
2955  if (isConstantStride () && A.isConstantStride ()) {
2956  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2957  }
2958  else {
2959  // Make sure that Kokkos only uses the local length for add.
2960  for (size_t k = 0; k < numVecs; ++k) {
2961  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2962  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2963  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2964  auto X_k = subview (X_lcl, ALL (), X_col);
2965 
2966  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2967  }
2968  }
2969  }
2970  else { // work on device
2971  // A is a "guest" of this method, so it's more polite to sync
2972  // *this, than to sync A.
2973  typedef typename dev_view_type::memory_space cur_memory_space;
2974  this->template sync<cur_memory_space> ();
2975  this->template modify<cur_memory_space> ();
2976  auto Y_lcl_orig = this->template getLocalView<cur_memory_space> ();
2977  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2978  auto X_lcl_orig = A.template getLocalView<cur_memory_space> ();
2979  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2980 
2981  if (isConstantStride () && A.isConstantStride ()) {
2982  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2983  }
2984  else {
2985  // Make sure that Kokkos only uses the local length for add.
2986  for (size_t k = 0; k < numVecs; ++k) {
2987  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2988  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2989  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2990  auto X_k = subview (X_lcl, ALL (), X_col);
2991 
2992  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2993  }
2994  }
2995  }
2996  }
2997 
2998  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2999  void
3001  update (const Scalar& alpha,
3003  const Scalar& beta,
3005  const Scalar& gamma)
3006  {
3007  using Kokkos::ALL;
3008  using Kokkos::subview;
3010  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3011 
3012  const size_t lclNumRows = this->getLocalLength ();
3013  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3014  lclNumRows != A.getLocalLength (), std::invalid_argument,
3015  "The input MultiVector A has " << A.getLocalLength () << " local "
3016  "row(s), but this MultiVector has " << lclNumRows << " local "
3017  "row(s).");
3018  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3019  lclNumRows != B.getLocalLength (), std::invalid_argument,
3020  "The input MultiVector B has " << B.getLocalLength () << " local "
3021  "row(s), but this MultiVector has " << lclNumRows << " local "
3022  "row(s).");
3023  const size_t numVecs = getNumVectors ();
3024  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3025  A.getNumVectors () != numVecs, std::invalid_argument,
3026  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3027  "but this MultiVector has " << numVecs << " column(s).");
3028  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3029  B.getNumVectors () != numVecs, std::invalid_argument,
3030  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3031  "but this MultiVector has " << numVecs << " column(s).");
3032 
3033  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3034  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3035  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3036 
3037  // We're lucky if *this, A, and B are all sync'd to the same
3038  // memory space. If not, we have to sync _something_. Unlike
3039  // three-argument update() or (say) dot(), we may have to sync one
3040  // of the inputs. For now, we just sync _everything_ to device.
3041  this->template sync<device_type> ();
3042  const_cast<MV&> (A).template sync<device_type> ();
3043  const_cast<MV&> (B).template sync<device_type> ();
3044 
3045  // This method modifies *this.
3046  this->template modify<device_type> ();
3047 
3048  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3049  const std::pair<size_t, size_t> colRng (0, numVecs);
3050 
3051  // Prefer 'auto' over specifying the type explicitly. This avoids
3052  // issues with a subview possibly having a different type than the
3053  // original view.
3054  auto C_lcl = subview (this->template getLocalView<device_type> (), rowRng, ALL ());
3055  auto A_lcl = subview (A.template getLocalView<device_type> (), rowRng, ALL ());
3056  auto B_lcl = subview (B.template getLocalView<device_type> (), rowRng, ALL ());
3057 
3058  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3059  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3060  }
3061  else {
3062  // Some input (or *this) is not constant stride,
3063  // so perform the update one column at a time.
3064  for (size_t k = 0; k < numVecs; ++k) {
3065  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3066  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3067  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3068  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3069  theBeta, subview (B_lcl, rowRng, B_col),
3070  theGamma, subview (C_lcl, rowRng, this_col));
3071  }
3072  }
3073  }
3074 
3075  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3076  Teuchos::ArrayRCP<const Scalar>
3078  getData (size_t j) const
3079  {
3080  using Kokkos::ALL;
3081  using Kokkos::subview;
3083 
3084  // Any MultiVector method that called the (classic) Kokkos Node's
3085  // viewBuffer or viewBufferNonConst methods always implied a
3086  // device->host synchronization. Thus, we synchronize here as
3087  // well.
3088  const_cast<MV*> (this)->template sync<Kokkos::HostSpace> ();
3089 
3090  // Get a host view of the entire MultiVector's data.
3091  auto hostView = this->template getLocalView<Kokkos::HostSpace> ();
3092 
3093  // Get a subview of column j.
3094  const size_t col = isConstantStride () ? j : whichVectors_[j];
3095  auto hostView_j = subview (hostView, ALL (), col);
3096 
3097  // Wrap up the subview of column j in an ArrayRCP<const impl_scalar_type>.
3098  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3099  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3100 
3101 #ifdef HAVE_TPETRA_DEBUG
3102  TEUCHOS_TEST_FOR_EXCEPTION
3103  (static_cast<size_t> (hostView_j.dimension_0 ()) <
3104  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3105  "Tpetra::MultiVector::getData: hostView_j.dimension_0() = "
3106  << hostView_j.dimension_0 () << " < dataAsArcp.size() = "
3107  << dataAsArcp.size () << ". "
3108  "Please report this bug to the Tpetra developers.");
3109 #endif // HAVE_TPETRA_DEBUG
3110 
3111  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3112  }
3113 
3114  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3115  Teuchos::ArrayRCP<Scalar>
3118  {
3119  using Kokkos::ALL;
3120  using Kokkos::subview;
3122 
3123  // Any MultiVector method that called the (classic) Kokkos Node's
3124  // viewBuffer or viewBufferNonConst methods always implied a
3125  // device->host synchronization. Thus, we synchronize here as
3126  // well.
3127  const_cast<MV*> (this)->template sync<Kokkos::HostSpace> ();
3128 
3129  // Calling getDataNonConst() implies that the user plans to modify
3130  // the values in the MultiVector, so we mark the host data as modified.
3131  this->template modify<Kokkos::HostSpace> ();
3132 
3133  // Get a host view of the entire MultiVector's data.
3134  auto hostView = this->template getLocalView<Kokkos::HostSpace> ();
3135 
3136  // Get a subview of column j.
3137  const size_t col = isConstantStride () ? j : whichVectors_[j];
3138  auto hostView_j = subview (hostView, ALL (), col);
3139 
3140  // Wrap up the subview of column j in an ArrayRCP<const impl_scalar_type>.
3141  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3142  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3143 
3144 #ifdef HAVE_TPETRA_DEBUG
3145  TEUCHOS_TEST_FOR_EXCEPTION
3146  (static_cast<size_t> (hostView_j.dimension_0 ()) <
3147  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3148  "Tpetra::MultiVector::getDataNonConst: hostView_j.dimension_0() = "
3149  << hostView_j.dimension_0 () << " < dataAsArcp.size() = "
3150  << dataAsArcp.size () << ". "
3151  "Please report this bug to the Tpetra developers.");
3152 #endif // HAVE_TPETRA_DEBUG
3153 
3154  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3155  }
3156 
3157  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3161  {
3162  if (this != &source) {
3163  base_type::operator= (source);
3164  //
3165  // operator= implements view semantics (shallow copy).
3166  //
3167 
3168  // Kokkos::View operator= also implements view semantics.
3169  view_ = source.view_;
3170  origView_ = source.origView_;
3171 
3172  // NOTE (mfh 24 Mar 2014) Christian wrote here that assigning
3173  // whichVectors_ is "probably not ok" (probably constitutes deep
3174  // copy). I would say that it's OK, because whichVectors_ is
3175  // immutable (from the user's perspective); it's analogous to
3176  // the dimensions or stride. Once we make whichVectors_ a
3177  // Kokkos::View instead of a Teuchos::Array, all debate will go
3178  // away and we will unquestionably have view semantics.
3179  whichVectors_ = source.whichVectors_;
3180  }
3181  return *this;
3182  }
3183 
3184 
3185  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3186  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3188  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3189  {
3190  using Teuchos::RCP;
3192 
3193  // Check whether the index set in cols is contiguous. If it is,
3194  // use the more efficient Range1D version of subCopy.
3195  bool contiguous = true;
3196  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3197  for (size_t j = 1; j < numCopyVecs; ++j) {
3198  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3199  contiguous = false;
3200  break;
3201  }
3202  }
3203  if (contiguous && numCopyVecs > 0) {
3204  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3205  }
3206  else {
3207  RCP<const MV> X_sub = this->subView (cols);
3208  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3209  Y->assign (*X_sub);
3210  return Y;
3211  }
3212  }
3213 
3214  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3215  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3217  subCopy (const Teuchos::Range1D &colRng) const
3218  {
3219  using Teuchos::RCP;
3221 
3222  RCP<const MV> X_sub = this->subView (colRng);
3223  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3224  Y->assign (*X_sub);
3225  return Y;
3226  }
3227 
3228  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3229  size_t
3232  return origView_.dimension_0 ();
3233  }
3234 
3235  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3236  size_t
3239  return origView_.dimension_1 ();
3240  }
3241 
3242  template <class Scalar, class LO, class GO, class Node, const bool classic>
3245  const map_type& subMap,
3246  const size_t offset) :
3247  base_type (Teuchos::null) // to be replaced below
3248  {
3249  using Kokkos::ALL;
3250  using Kokkos::subview;
3251  using Teuchos::RCP;
3252  using Teuchos::rcp;
3254  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3255 
3256  const size_t newNumRows = subMap.getNodeNumElements ();
3257  const bool tooManyElts = newNumRows + offset > X.getOrigNumLocalRows ();
3258  if (tooManyElts) {
3259  const int myRank = X.getMap ()->getComm ()->getRank ();
3260  TEUCHOS_TEST_FOR_EXCEPTION(
3261  newNumRows + offset > X.getLocalLength (), std::runtime_error,
3262  prefix << "Invalid input Map. The input Map owns " << newNumRows <<
3263  " entries on process " << myRank << ". offset = " << offset << ". "
3264  "Yet, the MultiVector contains only " << X.getOrigNumLocalRows () <<
3265  " rows on this process.");
3266  }
3267 
3268 #ifdef HAVE_TPETRA_DEBUG
3269  const size_t strideBefore =
3270  X.isConstantStride () ? X.getStride () : static_cast<size_t> (0);
3271  const size_t lclNumRowsBefore = X.getLocalLength ();
3272  const size_t numColsBefore = X.getNumVectors ();
3273  const impl_scalar_type* hostPtrBefore =
3274  X.template getLocalView<Kokkos::HostSpace> ().ptr_on_device ();
3275 #endif // HAVE_TPETRA_DEBUG
3276 
3277  const std::pair<size_t, size_t> origRowRng (offset, X.origView_.dimension_0 ());
3278  const std::pair<size_t, size_t> rowRng (offset, offset + newNumRows);
3279 
3280  dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3281  // FIXME (mfh 29 Sep 2016) If we just use X.view_ here, it breaks
3282  // CrsMatrix's Gauss-Seidel implementation (which assumes the
3283  // ability to create domain Map views of column Map MultiVectors,
3284  // and then get the original column Map MultiVector out again).
3285  // If we just use X.origView_ here, it breaks the fix for #46.
3286  // The test for offset == 0 is a hack that makes both tests pass,
3287  // but doesn't actually fix the more general issue. In
3288  // particular, the right way to fix Gauss-Seidel would be to fix
3289  // #385; that would make "getting the original column Map
3290  // MultiVector out again" unnecessary.
3291  dual_view_type newView = subview (offset == 0 ? X.origView_ : X.view_, rowRng, ALL ());
3292 
3293  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3294  // handling subviews of degenerate Views quite so well. For some
3295  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3296  // 0. We work around by creating a new empty DualView of the
3297  // desired (degenerate) dimensions.
3298  if (newOrigView.dimension_0 () == 0 &&
3299  newOrigView.dimension_1 () != X.origView_.dimension_1 ()) {
3300  newOrigView = allocDualView<Scalar, LO, GO, Node> (size_t (0),
3301  X.getNumVectors ());
3302  }
3303  if (newView.dimension_0 () == 0 &&
3304  newView.dimension_1 () != X.view_.dimension_1 ()) {
3305  newView = allocDualView<Scalar, LO, GO, Node> (size_t (0),
3306  X.getNumVectors ());
3307  }
3308 
3309  MV subViewMV = X.isConstantStride () ?
3310  MV (Teuchos::rcp (new map_type (subMap)), newView, newOrigView) :
3311  MV (Teuchos::rcp (new map_type (subMap)), newView, newOrigView, X.whichVectors_ ());
3312 
3313 #ifdef HAVE_TPETRA_DEBUG
3314  const size_t strideAfter = X.isConstantStride () ?
3315  X.getStride () :
3316  static_cast<size_t> (0);
3317  const size_t lclNumRowsAfter = X.getLocalLength ();
3318  const size_t numColsAfter = X.getNumVectors ();
3319  const impl_scalar_type* hostPtrAfter =
3320  X.template getLocalView<Kokkos::HostSpace> ().ptr_on_device ();
3321 
3322  const size_t strideRet = subViewMV.isConstantStride () ?
3323  subViewMV.getStride () :
3324  static_cast<size_t> (0);
3325  const size_t lclNumRowsRet = subViewMV.getLocalLength ();
3326  const size_t numColsRet = subViewMV.getNumVectors ();
3327 
3328  const char suffix[] = ". This should never happen. Please report this "
3329  "bug to the Tpetra developers.";
3330 
3331  TEUCHOS_TEST_FOR_EXCEPTION(
3332  lclNumRowsRet != subMap.getNodeNumElements (),
3333  std::logic_error, prefix << "Returned MultiVector has a number of rows "
3334  "different than the number of local indices in the input Map. "
3335  "lclNumRowsRet: " << lclNumRowsRet << ", subMap.getNodeNumElements(): "
3336  << subMap.getNodeNumElements () << suffix);
3337  TEUCHOS_TEST_FOR_EXCEPTION(
3338  strideBefore != strideAfter || lclNumRowsBefore != lclNumRowsAfter ||
3339  numColsBefore != numColsAfter || hostPtrBefore != hostPtrAfter,
3340  std::logic_error, prefix << "Original MultiVector changed dimensions, "
3341  "stride, or host pointer after taking offset view. strideBefore: " <<
3342  strideBefore << ", strideAfter: " << strideAfter << ", lclNumRowsBefore: "
3343  << lclNumRowsBefore << ", lclNumRowsAfter: " << lclNumRowsAfter <<
3344  ", numColsBefore: " << numColsBefore << ", numColsAfter: " <<
3345  numColsAfter << ", hostPtrBefore: " << hostPtrBefore << ", hostPtrAfter: "
3346  << hostPtrAfter << suffix);
3347  TEUCHOS_TEST_FOR_EXCEPTION(
3348  strideBefore != strideRet, std::logic_error, prefix << "Returned "
3349  "MultiVector has different stride than original MultiVector. "
3350  "strideBefore: " << strideBefore << ", strideRet: " << strideRet <<
3351  ", numColsBefore: " << numColsBefore << ", numColsRet: " << numColsRet
3352  << suffix);
3353  TEUCHOS_TEST_FOR_EXCEPTION(
3354  numColsBefore != numColsRet, std::logic_error,
3355  prefix << "Returned MultiVector has a different number of columns than "
3356  "original MultiVector. numColsBefore: " << numColsBefore << ", "
3357  "numColsRet: " << numColsRet << suffix);
3358 #endif // HAVE_TPETRA_DEBUG
3359 
3360  *this = subViewMV; // shallow copy
3361  }
3362 
3363  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3364  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3366  offsetView (const Teuchos::RCP<const map_type>& subMap,
3367  const size_t offset) const
3368  {
3370  return Teuchos::rcp (new MV (*this, *subMap, offset));
3371  }
3372 
3373  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3374  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3376  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3377  const size_t offset)
3378  {
3380  return Teuchos::rcp (new MV (*this, *subMap, offset));
3381  }
3382 
3383  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3384  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3386  subView (const Teuchos::ArrayView<const size_t>& cols) const
3387  {
3388  using Teuchos::Array;
3389  using Teuchos::rcp;
3391 
3392  const size_t numViewCols = static_cast<size_t> (cols.size ());
3393  TEUCHOS_TEST_FOR_EXCEPTION(
3394  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3395  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3396  "contain at least one entry, but cols.size() = " << cols.size ()
3397  << " == 0.");
3398 
3399  // Check whether the index set in cols is contiguous. If it is,
3400  // use the more efficient Range1D version of subView.
3401  bool contiguous = true;
3402  for (size_t j = 1; j < numViewCols; ++j) {
3403  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3404  contiguous = false;
3405  break;
3406  }
3407  }
3408  if (contiguous) {
3409  if (numViewCols == 0) {
3410  // The output MV has no columns, so there is nothing to view.
3411  return rcp (new MV (this->getMap (), numViewCols));
3412  } else {
3413  // Use the more efficient contiguous-index-range version.
3414  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3415  }
3416  }
3417 
3418  if (isConstantStride ()) {
3419  return rcp (new MV (this->getMap (), view_, origView_, cols));
3420  }
3421  else {
3422  Array<size_t> newcols (cols.size ());
3423  for (size_t j = 0; j < numViewCols; ++j) {
3424  newcols[j] = whichVectors_[cols[j]];
3425  }
3426  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3427  }
3428  }
3429 
3430 
3431  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3432  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3434  subView (const Teuchos::Range1D& colRng) const
3435  {
3436  using Kokkos::ALL;
3437  using Kokkos::subview;
3438  using Teuchos::Array;
3439  using Teuchos::RCP;
3440  using Teuchos::rcp;
3442  const char tfecfFuncName[] = "subView(Range1D): ";
3443 
3444  const size_t lclNumRows = this->getLocalLength ();
3445  const size_t numVecs = this->getNumVectors ();
3446  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3447  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3448  // "at least one vector.");
3449  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3450  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3451  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3452  << numVecs << ".");
3453  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3454  numVecs != 0 && colRng.size () != 0 &&
3455  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3456  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3457  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3458  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3459  "[0, " << numVecs << "].");
3460 
3461  RCP<const MV> X_ret; // the MultiVector subview to return
3462 
3463  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3464  // broken for the case of views with zero rows. I will brutally
3465  // enforce that the subview has the correct dimensions. In
3466  // particular, in the case of zero rows, I will, if necessary,
3467  // create a new dual_view_type with zero rows and the correct
3468  // number of columns. In a debug build, I will use an all-reduce
3469  // to ensure that it has the correct dimensions on all processes.
3470 
3471  const std::pair<size_t, size_t> rows (0, lclNumRows);
3472  if (colRng.size () == 0) {
3473  const std::pair<size_t, size_t> cols (0, 0); // empty range
3474  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3475  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3476  }
3477  else {
3478  // Returned MultiVector is constant stride only if *this is.
3479  if (isConstantStride ()) {
3480  const std::pair<size_t, size_t> cols (colRng.lbound (),
3481  colRng.ubound () + 1);
3482  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3483  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3484  }
3485  else {
3486  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3487  // We're only asking for one column, so the result does have
3488  // constant stride, even though this MultiVector does not.
3489  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3490  whichVectors_[0] + colRng.ubound () + 1);
3491  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3492  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3493  }
3494  else {
3495  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3496  whichVectors_.begin () + colRng.ubound () + 1);
3497  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3498  }
3499  }
3500  }
3501 
3502 #ifdef HAVE_TPETRA_DEBUG
3503  using Teuchos::Comm;
3504  using Teuchos::outArg;
3505  using Teuchos::REDUCE_MIN;
3506  using Teuchos::reduceAll;
3507 
3508  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
3509  this->getMap ()->getComm ();
3510  if (! comm.is_null ()) {
3511  int lclSuccess = 1;
3512  int gblSuccess = 1;
3513 
3514  if (X_ret.is_null ()) {
3515  lclSuccess = 0;
3516  }
3517  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3518  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3519  lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3520  "MultiVector; the return value of this method) is null on some MPI "
3521  "process in this MultiVector's communicator. This should never "
3522  "happen. Please report this bug to the Tpetra developers.");
3523 
3524  if (! X_ret.is_null () &&
3525  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3526  lclSuccess = 0;
3527  }
3528  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3529  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3530  lclSuccess != 1, std::logic_error,
3531  "X_ret->getNumVectors() != colRng.size(), on at least one MPI process "
3532  "in this MultiVector's communicator. This should never happen. "
3533  "Please report this bug to the Tpetra developers.");
3534  }
3535 #endif // HAVE_TPETRA_DEBUG
3536 
3537  return X_ret;
3538  }
3539 
3540 
3541  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3542  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3544  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3545  {
3547  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3548  }
3549 
3550 
3551  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3552  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3554  subViewNonConst (const Teuchos::Range1D &colRng)
3555  {
3557  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3558  }
3559 
3560 
3561  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3562  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3564  getVector (const size_t j) const
3565  {
3566  using Kokkos::ALL;
3567  using Kokkos::subview;
3568  using Teuchos::rcp;
3570 
3571 #ifdef HAVE_TPETRA_DEBUG
3572  const char tfecfFuncName[] = "getVector(NonConst): ";
3573  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3574  this->vectorIndexOutOfRange (j), std::runtime_error, "Input index j (== "
3575  << j << ") exceeds valid range [0, " << this->getNumVectors ()
3576  << " - 1].");
3577 #endif // HAVE_TPETRA_DEBUG
3578  const size_t jj = this->isConstantStride () ?
3579  static_cast<size_t> (j) :
3580  static_cast<size_t> (this->whichVectors_[j]);
3581  const std::pair<size_t, size_t> rng (jj, jj+1);
3582  return rcp (new V (this->getMap (),
3583  takeSubview (this->view_, ALL (), rng),
3584  origView_));
3585  }
3586 
3587 
3588  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3589  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3591  getVectorNonConst (const size_t j)
3592  {
3594  return Teuchos::rcp_const_cast<V> (this->getVector (j));
3595  }
3596 
3597 
3598  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3599  void
3601  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3602  {
3603  typedef decltype (this->template getLocalView<device_type> ())
3604  dev_view_type;
3605  typedef decltype (this->template getLocalView<Kokkos::HostSpace> ())
3606  host_view_type;
3607  typedef impl_scalar_type IST;
3608  typedef Kokkos::View<IST*, typename host_view_type::array_layout,
3609  Kokkos::HostSpace, Kokkos::MemoryUnmanaged> input_col_type;
3610  const char tfecfFuncName[] = "get1dCopy: ";
3611 
3612  const size_t numRows = this->getLocalLength ();
3613  const size_t numCols = this->getNumVectors ();
3614  const std::pair<size_t, size_t> rowRange (0, numRows);
3615 
3616  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3617  LDA < numRows, std::runtime_error,
3618  "LDA = " << LDA << " < numRows = " << numRows << ".");
3619  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3620  numRows > static_cast<size_t> (0) &&
3621  numCols > static_cast<size_t> (0) &&
3622  static_cast<size_t> (A.size ()) < LDA * (numCols - 1) + numRows,
3623  std::runtime_error,
3624  "A.size() = " << A.size () << ", but its size must be at least "
3625  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3626 
3627  // FIXME (mfh 22 Jul 2014, 10 Dec 2014) Currently, it doesn't work
3628  // to do a 2-D copy, even if this MultiVector has constant stride.
3629  // This is because Kokkos can't currently tell the difference
3630  // between padding (which permits a single deep_copy for the whole
3631  // 2-D View) and stride > numRows (which does NOT permit a single
3632  // deep_copy for the whole 2-D View). Carter is working on this,
3633  // but for now, the temporary fix is to copy one column at a time.
3634 
3635  // Use the most recently updated version of this MultiVector's
3636  // data. This avoids sync'ing, which could violate users'
3637  // expectations.
3638  //
3639  // If we need sync to device, then host has the most recent version.
3640  const bool useHostVersion = this->template need_sync<device_type> ();
3641 
3642  dev_view_type srcView_dev;
3643  host_view_type srcView_host;
3644  if (useHostVersion) {
3645  srcView_host = this->template getLocalView<Kokkos::HostSpace> ();
3646  }
3647  else {
3648  srcView_dev = this->template getLocalView<device_type> ();
3649  }
3650 
3651  for (size_t j = 0; j < numCols; ++j) {
3652  const size_t srcCol =
3653  this->isConstantStride () ? j : this->whichVectors_[j];
3654  const size_t dstCol = j;
3655  IST* const dstColRaw =
3656  reinterpret_cast<IST*> (A.getRawPtr () + LDA * dstCol);
3657  input_col_type dstColView (dstColRaw, numRows);
3658 
3659  if (useHostVersion) {
3660  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3661  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3662  dstColView.dimension_0 () != srcColView_host.dimension_0 (),
3663  std::logic_error, ": srcColView and dstColView_host have different "
3664  "dimensions. Please report this bug to the Tpetra developers.");
3665  Kokkos::deep_copy (dstColView, srcColView_host);
3666  }
3667  else {
3668  auto srcColView_dev = Kokkos::subview (srcView_dev, rowRange, srcCol);
3669  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3670  dstColView.dimension_0 () != srcColView_dev.dimension_0 (),
3671  std::logic_error, ": srcColView and dstColView_dev have different "
3672  "dimensions. Please report this bug to the Tpetra developers.");
3673  Kokkos::deep_copy (dstColView, srcColView_dev);
3674  }
3675  }
3676  }
3677 
3678 
3679  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3680  void
3682  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3683  {
3685  const char tfecfFuncName[] = "get2dCopy: ";
3686  const size_t numRows = this->getLocalLength ();
3687  const size_t numCols = this->getNumVectors ();
3688 
3689  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3690  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3691  std::runtime_error, "Input array of pointers must contain as many "
3692  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3693  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3694 
3695  if (numRows != 0 && numCols != 0) {
3696  // No side effects until we've validated the input.
3697  for (size_t j = 0; j < numCols; ++j) {
3698  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3699  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3700  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3701  "the input array of arrays is not long enough to fit all entries in "
3702  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3703  << " < getLocalLength() = " << numRows << ".");
3704  }
3705 
3706  // We've validated the input, so it's safe to start copying.
3707  for (size_t j = 0; j < numCols; ++j) {
3708  Teuchos::RCP<const V> X_j = this->getVector (j);
3709  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3710  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3711  }
3712  }
3713  }
3714 
3715  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3716  Teuchos::ArrayRCP<const Scalar>
3718  get1dView () const
3719  {
3720  if (getLocalLength () == 0 || getNumVectors () == 0) {
3721  return Teuchos::null;
3722  } else {
3724 
3725  TEUCHOS_TEST_FOR_EXCEPTION(
3726  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3727  "get1dView: This MultiVector does not have constant stride, so it is "
3728  "not possible to view its data as a single array. You may check "
3729  "whether a MultiVector has constant stride by calling "
3730  "isConstantStride().");
3731  // mfh 18 May 2016: get?dView() and get?dViewNonConst() (replace
3732  // ? with 1 or 2) have always been device->host synchronization
3733  // points, since <= 2012. We retain this behavior for backwards
3734  // compatibility.
3735  //
3736  // Yes, "const" is a lie.
3737  const_cast<MV*> (this)->template sync<Kokkos::HostSpace> ();
3738  // Both get1dView() and get1dViewNonConst() return a host view
3739  // of the data.
3740  auto X_lcl = this->template getLocalView<Kokkos::HostSpace> ();
3741  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3742  Kokkos::Compat::persistingView (X_lcl);
3743  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3744  }
3745  }
3746 
3747  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3748  Teuchos::ArrayRCP<Scalar>
3751  {
3752  if (getLocalLength () == 0 || getNumVectors () == 0) {
3753  return Teuchos::null;
3754  } else {
3755  TEUCHOS_TEST_FOR_EXCEPTION
3756  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3757  "get1dViewNonConst: This MultiVector does not have constant stride, "
3758  "so it is not possible to view its data as a single array. You may "
3759  "check whether a MultiVector has constant stride by calling "
3760  "isConstantStride().");
3761  // get1dView() and get1dViewNonConst() have always been
3762  // device->host synchronization points, since <= 2012. We
3763  // retain this behavior for backwards compatibility.
3764  this->template sync<Kokkos::HostSpace> ();
3765  // Both get1dView() and get1dViewNonConst() return a host view
3766  // of the data.
3767  auto X_lcl = this->template getLocalView<Kokkos::HostSpace> ();
3768  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3769  Kokkos::Compat::persistingView (X_lcl);
3770  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3771  }
3772  }
3773 
3774  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3775  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3778  {
3779  // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
3780  // (replace ? with 1 or 2) have always been device->host
3781  // synchronization points, since <= 2012. We retain this behavior
3782  // for backwards compatibility.
3783  this->sync<Kokkos::HostSpace> ();
3784  // When users call the NonConst variants, it implies that they
3785  // want to change the data. Thus, it is appropriate to mark this
3786  // MultiVector as modified.
3787  this->modify<Kokkos::HostSpace> ();
3788 
3789  const size_t myNumRows = this->getLocalLength ();
3790  const size_t numCols = this->getNumVectors ();
3791  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3792  // Don't use the row range here on the outside, in order to avoid
3793  // a strided return type (in case Kokkos::subview is conservative
3794  // about that). Instead, use the row range for the column views
3795  // in the loop.
3796  auto X_lcl = this->getLocalView<Kokkos::HostSpace> ();
3797 
3798  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3799  for (size_t j = 0; j < numCols; ++j) {
3800  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3801  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3802  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3803  Kokkos::Compat::persistingView (X_lcl_j);
3804  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3805  }
3806  return views;
3807  }
3808 
3809  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3810  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3812  get2dView () const
3813  {
3814  // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
3815  // (replace ? with 1 or 2) have always been device->host
3816  // synchronization points, since <= 2012. We retain this behavior
3817  // for backwards compatibility.
3818  //
3819  // Since get2dView() is and was (unfortunately) always marked
3820  // const, I have to cast away const here in order not to break
3821  // backwards compatibility.
3823  const_cast<this_type*> (this)->sync<Kokkos::HostSpace> ();
3824 
3825  const size_t myNumRows = this->getLocalLength ();
3826  const size_t numCols = this->getNumVectors ();
3827  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3828  // Don't use the row range here on the outside, in order to avoid
3829  // a strided return type (in case Kokkos::subview is conservative
3830  // about that). Instead, use the row range for the column views
3831  // in the loop.
3832  auto X_lcl = this->getLocalView<Kokkos::HostSpace> ();
3833 
3834  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3835  for (size_t j = 0; j < numCols; ++j) {
3836  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3837  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3838  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3839  Kokkos::Compat::persistingView (X_lcl_j);
3840  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3841  }
3842  return views;
3843  }
3844 
3845  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3846  void
3848  multiply (Teuchos::ETransp transA,
3849  Teuchos::ETransp transB,
3850  const Scalar& alpha,
3853  const Scalar& beta)
3854  {
3855  using Teuchos::CONJ_TRANS;
3856  using Teuchos::NO_TRANS;
3857  using Teuchos::TRANS;
3858  using Teuchos::RCP;
3859  using Teuchos::rcp;
3860  using Teuchos::rcpFromRef;
3861  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
3862  typedef Teuchos::ScalarTraits<Scalar> STS;
3864  const char errPrefix[] = "Tpetra::MultiVector::multiply: ";
3865 
3866  // This routine performs a variety of matrix-matrix multiply
3867  // operations, interpreting the MultiVector (this-aka C , A and B)
3868  // as 2D matrices. Variations are due to the fact that A, B and C
3869  // can be local replicated or global distributed MultiVectors and
3870  // that we may or may not operate with the transpose of A and B.
3871  // Possible cases are:
3872  //
3873  // Operations # Cases Notes
3874  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3875  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3876  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3877  //
3878  // The following operations are not meaningful for 1-D
3879  // distributions:
3880  //
3881  // u1) C(local) = A^T(distr) * B^T(distr) 1
3882  // u2) C(local) = A (distr) * B^X(distr) 2
3883  // u3) C(distr) = A^X(local) * B^X(local) 4
3884  // u4) C(distr) = A^X(local) * B^X(distr) 4
3885  // u5) C(distr) = A^T(distr) * B^X(local) 2
3886  // u6) C(local) = A^X(distr) * B^X(local) 4
3887  // u7) C(distr) = A^X(distr) * B^X(local) 4
3888  // u8) C(local) = A^X(local) * B^X(distr) 4
3889  //
3890  // Total number of cases: 32 (= 2^5).
3891 
3892  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3893 
3894  // In a debug build, check compatibility of local dimensions. We
3895  // only do this in a debug build, since we have to do an
3896  // all-reduce to ensure correctness on all processses. It's
3897  // entirely possible that only some processes may have
3898  // incompatible local dimensions. Throwing an exception only on
3899  // those processes could cause this method to hang.
3900 #ifdef HAVE_TPETRA_DEBUG
3901  if (! this->getMap ().is_null () && ! this->getMap ()->getComm ().is_null ()) {
3902  const size_t A_nrows = (transA != NO_TRANS) ? A.getNumVectors() : A.getLocalLength();
3903  const size_t A_ncols = (transA != NO_TRANS) ? A.getLocalLength() : A.getNumVectors();
3904  const size_t B_nrows = (transB != NO_TRANS) ? B.getNumVectors() : B.getLocalLength();
3905  const size_t B_ncols = (transB != NO_TRANS) ? B.getLocalLength() : B.getNumVectors();
3906 
3907  const bool lclBad = this->getLocalLength () != A_nrows ||
3908  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
3909 
3910  const int lclGood = lclBad ? 0 : 1;
3911  int gblGood = 0;
3912 
3913  using Teuchos::REDUCE_MIN;
3914  using Teuchos::reduceAll;
3915  using Teuchos::outArg;
3916 
3917  auto comm = this->getMap ()->getComm (); // not null; see above
3918  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3919 
3920  TEUCHOS_TEST_FOR_EXCEPTION
3921  (gblGood != 1, std::runtime_error, errPrefix << "Local dimensions of "
3922  "*this, op(A), and op(B) are not consistent on at least one process "
3923  "in this object's communicator.");
3924  }
3925 #endif // HAVE_TPETRA_DEBUG
3926 
3927  const bool A_is_local = ! A.isDistributed ();
3928  const bool B_is_local = ! B.isDistributed ();
3929  const bool C_is_local = ! this->isDistributed ();
3930  // Case 1: C(local) = A^X(local) * B^X(local)
3931  const bool Case1 = C_is_local && A_is_local && B_is_local;
3932  // Case 2: C(local) = A^T(distr) * B (distr)
3933  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
3934  transA != NO_TRANS &&
3935  transB == NO_TRANS;
3936  // Case 3: C(distr) = A (distr) * B^X(local)
3937  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
3938  transA == NO_TRANS;
3939 
3940  // Test that we are considering a meaningful case
3941  TEUCHOS_TEST_FOR_EXCEPTION(
3942  ! Case1 && ! Case2 && ! Case3, std::runtime_error, errPrefix
3943  << "Multiplication of op(A) and op(B) into *this is not a "
3944  "supported use case.");
3945 
3946  if (beta != STS::zero () && Case2) {
3947  // If Case2, then C is local and contributions must be summed
3948  // across all processes. However, if beta != 0, then accumulate
3949  // beta*C into the sum. When summing across all processes, we
3950  // only want to accumulate this once, so set beta == 0 on all
3951  // processes except Process 0.
3952  const int myRank = this->getMap ()->getComm ()->getRank ();
3953  if (myRank != 0) {
3954  beta_local = ATS::zero ();
3955  }
3956  }
3957 
3958  // We only know how to do matrix-matrix multiplies if all the
3959  // MultiVectors have constant stride. If not, we have to make
3960  // temporary copies of those MultiVectors (including possibly
3961  // *this) that don't have constant stride.
3962  RCP<MV> C_tmp;
3963  if (! isConstantStride ()) {
3964  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
3965  } else {
3966  C_tmp = rcp (this, false);
3967  }
3968 
3969  RCP<const MV> A_tmp;
3970  if (! A.isConstantStride ()) {
3971  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
3972  } else {
3973  A_tmp = rcpFromRef (A);
3974  }
3975 
3976  RCP<const MV> B_tmp;
3977  if (! B.isConstantStride ()) {
3978  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
3979  } else {
3980  B_tmp = rcpFromRef (B);
3981  }
3982 
3983  TEUCHOS_TEST_FOR_EXCEPTION(
3984  ! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
3985  ! A_tmp->isConstantStride (), std::logic_error, errPrefix
3986  << "Failed to make temporary constant-stride copies of MultiVectors.");
3987 
3989 
3990  {
3991  const size_t A_lclNumRows = A_tmp->getLocalLength ();
3992  const size_t A_numVecs = A_tmp->getNumVectors ();
3993  auto A_lcl = A_tmp->template getLocalView<device_type> ();
3994  auto A_sub = Kokkos::subview (A_lcl,
3995  std::make_pair (size_t (0), A_lclNumRows),
3996  std::make_pair (size_t (0), A_numVecs));
3997  const size_t B_lclNumRows = B_tmp->getLocalLength ();
3998  const size_t B_numVecs = B_tmp->getNumVectors ();
3999  auto B_lcl = B_tmp->template getLocalView<device_type> ();
4000  auto B_sub = Kokkos::subview (B_lcl,
4001  std::make_pair (size_t (0), B_lclNumRows),
4002  std::make_pair (size_t (0), B_numVecs));
4003  const size_t C_lclNumRows = C_tmp->getLocalLength ();
4004  const size_t C_numVecs = C_tmp->getNumVectors ();
4005  auto C_lcl = C_tmp->template getLocalView<device_type> ();
4006  auto C_sub = Kokkos::subview (C_lcl,
4007  std::make_pair (size_t (0), C_lclNumRows),
4008  std::make_pair (size_t (0), C_numVecs));
4009  gemm_type::GEMM (transA, transB, alpha, A_sub, B_sub, beta_local, C_sub);
4010  }
4011 
4012  if (! isConstantStride ()) {
4013  deep_copy (*this, *C_tmp); // Copy the result back into *this.
4014  }
4015 
4016  // Dispose of (possibly) extra copies of A and B.
4017  A_tmp = Teuchos::null;
4018  B_tmp = Teuchos::null;
4019 
4020  // If Case 2 then sum up *this and distribute it to all processes.
4021  if (Case2) {
4022  this->reduce ();
4023  }
4024  }
4025 
4026  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4027  void
4029  elementWiseMultiply (Scalar scalarAB,
4032  Scalar scalarThis)
4033  {
4034  using Kokkos::ALL;
4035  using Kokkos::subview;
4038  const char tfecfFuncName[] = "elementWiseMultiply: ";
4039 
4040  const size_t lclNumRows = this->getLocalLength ();
4041  const size_t numVecs = this->getNumVectors ();
4042 
4043  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4044  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4045  std::runtime_error, "MultiVectors do not have the same local length.");
4046  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4047  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4048  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4049  << ".");
4050 
4051  // FIXME (mfh 18 May 2016) It would be rude to sync A and B here,
4052  // because they are guests of this method. Instead, the polite
4053  // thing to do would be to copy them (if necessary) so we get
4054  // their most recently updated version. *this should always
4055  // control where execution happens.
4056 
4057  this->template sync<device_type> ();
4058  this->template modify<device_type> ();
4059  const_cast<V&> (A).template sync<device_type> ();
4060  const_cast<MV&> (B).template sync<device_type> ();
4061  auto this_view = this->template getLocalView<device_type> ();
4062  auto A_view = A.template getLocalView<device_type> ();
4063  auto B_view = B.template getLocalView<device_type> ();
4064 
4065  if (isConstantStride () && B.isConstantStride ()) {
4066  // A is just a Vector; it only has one column, so it always has
4067  // constant stride.
4068  //
4069  // If both *this and B have constant stride, we can do an
4070  // element-wise multiply on all columns at once.
4071  KokkosBlas::mult (scalarThis,
4072  this_view,
4073  scalarAB,
4074  subview (A_view, ALL (), 0),
4075  B_view);
4076  }
4077  else {
4078  for (size_t j = 0; j < numVecs; ++j) {
4079  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4080  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4081  KokkosBlas::mult (scalarThis,
4082  subview (this_view, ALL (), C_col),
4083  scalarAB,
4084  subview (A_view, ALL (), 0),
4085  subview (B_view, ALL (), B_col));
4086  }
4087  }
4088  }
4089 
4090  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4091  void
4094  {
4095  using Teuchos::reduceAll;
4096  using Teuchos::REDUCE_SUM;
4097  typedef decltype (this->template getLocalView<device_type> ())
4098  dev_view_type;
4099  typedef decltype (this->template getLocalView<Kokkos::HostSpace> ())
4100  host_view_type;
4101 
4102  TEUCHOS_TEST_FOR_EXCEPTION
4103  (this->isDistributed (), std::runtime_error,
4104  "Tpetra::MultiVector::reduce should only be called with locally "
4105  "replicated or otherwise not distributed MultiVector objects.");
4106  const Teuchos::Comm<int>& comm = * (this->getMap ()->getComm ());
4107  if (comm.getSize () == 1) {
4108  return; // MultiVector is already "reduced" to one process
4109  }
4110 
4111  const size_t lclNumRows = getLocalLength ();
4112  const size_t numCols = getNumVectors ();
4113  const size_t totalAllocSize = lclNumRows * numCols;
4114 
4115  // FIXME (mfh 16 June 2014) This exception will cause deadlock if
4116  // it triggers on only some processes. We don't have a good way
4117  // to pack this result into the all-reduce below, but this would
4118  // be a good reason to set a "local error flag" and find other
4119  // opportunities to let it propagate.
4120  TEUCHOS_TEST_FOR_EXCEPTION(
4121  lclNumRows > static_cast<size_t> (std::numeric_limits<int>::max ()),
4122  std::runtime_error, "Tpetra::MultiVector::reduce: On Process " <<
4123  comm.getRank () << ", the number of local rows " << lclNumRows <<
4124  " does not fit in int.");
4125 
4126  //
4127  // Use MPI to sum the entries across all local blocks.
4128  //
4129 
4130  // Get the most recently updated version of the data.
4131  //
4132  // If we need sync to device, then host has the most recent version.
4133  const bool useHostVersion = this->template need_sync<device_type> ();
4134  // Only one of these (the most recently updated one) is active.
4135  dev_view_type srcView_dev;
4136  host_view_type srcView_host;
4137  if (useHostVersion) {
4138  srcView_host = this->template getLocalView<Kokkos::HostSpace> ();
4139  if (lclNumRows != static_cast<size_t> (srcView_host.dimension_0 ())) {
4140  // Make sure the number of rows is correct. If not, take a subview.
4141  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
4142  srcView_host = Kokkos::subview (srcView_host, rowRng, Kokkos::ALL ());
4143  }
4144  }
4145  else {
4146  srcView_dev = this->template getLocalView<device_type> ();
4147  if (lclNumRows != static_cast<size_t> (srcView_dev.dimension_0 ())) {
4148  // Make sure the number of rows is correct. If not, take a subview.
4149  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
4150  srcView_dev = Kokkos::subview (srcView_dev, rowRng, Kokkos::ALL ());
4151  }
4152  }
4153 
4154  // If this MultiVector's local data are stored contiguously, we
4155  // can use the local View as the source buffer in the
4156  // MPI_Allreduce. Otherwise, we have to allocate a temporary
4157  // source buffer and pack.
4158  const bool contig = isConstantStride () && getStride () == lclNumRows;
4159  dev_view_type srcBuf_dev;
4160  host_view_type srcBuf_host;
4161  if (useHostVersion) {
4162  if (contig) {
4163  srcBuf_host = srcView_host; // use the View as-is
4164  }
4165  else { // need to repack into a contiguous buffer
4166  srcBuf_host = decltype (srcBuf_host) ("srcBuf", lclNumRows, numCols);
4167  Kokkos::deep_copy (srcBuf_host, srcView_host);
4168  }
4169  }
4170  else { // use device version
4171  if (contig) {
4172  srcBuf_dev = srcView_dev; // use the View as-is
4173  }
4174  else { // need to repack into a contiguous buffer
4175  srcBuf_dev = decltype (srcBuf_dev) ("srcBuf", lclNumRows, numCols);
4176  Kokkos::deep_copy (srcBuf_dev, srcView_dev);
4177  }
4178  }
4179 
4180  // Check expected invariant of the above block of code. At this
4181  // point, either the srcBuf of choice points to the srcView of
4182  // choice, or it has the right allocation size.
4183  {
4184  // Use >=, not ==, because if srcBuf just points to srcView,
4185  // then srcView may actually be bigger than what we need.
4186  const bool correct =
4187  (useHostVersion && static_cast<size_t> (srcBuf_host.size ()) >= totalAllocSize) ||
4188  (! useHostVersion && static_cast<size_t> (srcBuf_dev.size ()) >= totalAllocSize);
4189  TEUCHOS_TEST_FOR_EXCEPTION
4190  (! correct, std::logic_error, "Tpetra::MultiVector::reduce: Violated "
4191  "invariant of temporary source buffer construction. Please report "
4192  "this bug to the Tpetra developers.");
4193  }
4194 
4195  // MPI requires that the send and receive buffers don't alias one
4196  // another, so we have to copy temporary storage for the result.
4197  //
4198  // We expect that MPI implementations will know how to read device
4199  // pointers.
4200  dev_view_type tgtBuf_dev;
4201  host_view_type tgtBuf_host;
4202  if (useHostVersion) {
4203  tgtBuf_host = host_view_type ("tgtBuf", lclNumRows, numCols);
4204  }
4205  else {
4206  tgtBuf_dev = dev_view_type ("tgtBuf", lclNumRows, numCols);
4207  }
4208 
4209  const int reduceCount = static_cast<int> (totalAllocSize);
4210  if (useHostVersion) {
4211  TEUCHOS_TEST_FOR_EXCEPTION
4212  (static_cast<size_t> (tgtBuf_host.size ()) < totalAllocSize,
4213  std::logic_error, "Tpetra::MultiVector::reduce: tgtBuf_host.size() = "
4214  << tgtBuf_host.size () << " < lclNumRows*numCols = " << totalAllocSize
4215  << ". Please report this bug to the Tpetra developers.");
4216  reduceAll<int, impl_scalar_type> (comm, REDUCE_SUM, reduceCount,
4217  srcBuf_host.ptr_on_device (),
4218  tgtBuf_host.ptr_on_device ());
4219  }
4220  else { // use device version
4221  TEUCHOS_TEST_FOR_EXCEPTION
4222  (static_cast<size_t> (tgtBuf_dev.size ()) < totalAllocSize,
4223  std::logic_error, "Tpetra::MultiVector::reduce: tgtBuf_dev.size() = "
4224  << tgtBuf_dev.size () << " < lclNumRows*numCols = " << totalAllocSize
4225  << ". Please report this bug to the Tpetra developers.");
4226  reduceAll<int, impl_scalar_type> (comm, REDUCE_SUM, reduceCount,
4227  srcBuf_dev.ptr_on_device (),
4228  tgtBuf_dev.ptr_on_device ());
4229  }
4230 
4231  // Write back the results to *this.
4232  if (useHostVersion) {
4233  this->template modify<Kokkos::HostSpace> ();
4234  if (contig || isConstantStride ()) {
4235  Kokkos::deep_copy (srcView_host, tgtBuf_host);
4236  }
4237  else { // need to copy one column at a time
4238  for (size_t j = 0; j < numCols; ++j) {
4239  auto X_j_out = Kokkos::subview (srcView_host, Kokkos::ALL (), j);
4240  auto X_j_in = Kokkos::subview (tgtBuf_host, Kokkos::ALL (), j);
4241  Kokkos::deep_copy (X_j_out, X_j_in);
4242  }
4243  }
4244  }
4245  else { // use device version
4246  this->template modify<device_type> ();
4247  if (contig || isConstantStride ()) {
4248  Kokkos::deep_copy (srcView_dev, tgtBuf_dev);
4249  }
4250  else { // need to copy one column at a time
4251  for (size_t j = 0; j < numCols; ++j) {
4252  auto X_j_out = Kokkos::subview (srcView_dev, Kokkos::ALL (), j);
4253  auto X_j_in = Kokkos::subview (tgtBuf_dev, Kokkos::ALL (), j);
4254  Kokkos::deep_copy (X_j_out, X_j_in);
4255  }
4256  }
4257  }
4258 
4259  // We leave *this unsynchronized.
4260  }
4261 
4262 
4263  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4264  void
4266  replaceLocalValue (const LocalOrdinal lclRow,
4267  const size_t col,
4268  const impl_scalar_type& ScalarValue) const
4269  {
4270 #ifdef HAVE_TPETRA_DEBUG
4271  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4272  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4273  TEUCHOS_TEST_FOR_EXCEPTION(
4274  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4275  std::runtime_error,
4276  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4277  << " is invalid. The range of valid row indices on this process "
4278  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4279  << ", " << maxLocalIndex << "].");
4280  TEUCHOS_TEST_FOR_EXCEPTION(
4281  vectorIndexOutOfRange(col),
4282  std::runtime_error,
4283  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4284  << " of the multivector is invalid.");
4285 #endif
4286  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4287  view_.h_view (lclRow, colInd) = ScalarValue;
4288  }
4289 
4290 
4291  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4292  void
4294  sumIntoLocalValue (const LocalOrdinal lclRow,
4295  const size_t col,
4296  const impl_scalar_type& value,
4297  const bool atomic) const
4298  {
4299 #ifdef HAVE_TPETRA_DEBUG
4300  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4301  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4302  TEUCHOS_TEST_FOR_EXCEPTION(
4303  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4304  std::runtime_error,
4305  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4306  << " is invalid. The range of valid row indices on this process "
4307  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4308  << ", " << maxLocalIndex << "].");
4309  TEUCHOS_TEST_FOR_EXCEPTION(
4310  vectorIndexOutOfRange(col),
4311  std::runtime_error,
4312  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4313  << " of the multivector is invalid.");
4314 #endif
4315  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4316  if (atomic) {
4317  Kokkos::atomic_add (& (view_.h_view(lclRow, colInd)), value);
4318  }
4319  else {
4320  view_.h_view (lclRow, colInd) += value;
4321  }
4322  }
4323 
4324 
4325  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4326  void
4328  replaceGlobalValue (const GlobalOrdinal gblRow,
4329  const size_t col,
4330  const impl_scalar_type& ScalarValue) const
4331  {
4332  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4333  // touches the RCP's reference count, which isn't thread safe.
4334  const LocalOrdinal MyRow = this->map_->getLocalElement (gblRow);
4335 #ifdef HAVE_TPETRA_DEBUG
4336  TEUCHOS_TEST_FOR_EXCEPTION(
4337  MyRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4338  std::runtime_error,
4339  "Tpetra::MultiVector::replaceGlobalValue: Global row index " << gblRow
4340  << "is not present on this process "
4341  << this->getMap ()->getComm ()->getRank () << ".");
4342  TEUCHOS_TEST_FOR_EXCEPTION(
4343  vectorIndexOutOfRange (col), std::runtime_error,
4344  "Tpetra::MultiVector::replaceGlobalValue: Vector index " << col
4345  << " of the multivector is invalid.");
4346 #endif // HAVE_TPETRA_DEBUG
4347  this->replaceLocalValue (MyRow, col, ScalarValue);
4348  }
4349 
4350  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4351  void
4353  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4354  const size_t col,
4355  const impl_scalar_type& value,
4356  const bool atomic) const
4357  {
4358  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4359  // touches the RCP's reference count, which isn't thread safe.
4360  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4361 #ifdef HAVE_TEUCHOS_DEBUG
4362  TEUCHOS_TEST_FOR_EXCEPTION(
4363  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4364  std::runtime_error,
4365  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4366  << "is not present on this process "
4367  << this->getMap ()->getComm ()->getRank () << ".");
4368  TEUCHOS_TEST_FOR_EXCEPTION(
4369  vectorIndexOutOfRange(col),
4370  std::runtime_error,
4371  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4372  << " of the multivector is invalid.");
4373 #endif
4374  this->sumIntoLocalValue (lclRow, col, value, atomic);
4375  }
4376 
4377  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4378  template <class T>
4379  Teuchos::ArrayRCP<T>
4381  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4382  size_t j) const
4383  {
4384  typedef Kokkos::DualView<impl_scalar_type*,
4385  typename dual_view_type::array_layout,
4386  execution_space> col_dual_view_type;
4387  const size_t col = isConstantStride () ? j : whichVectors_[j];
4388  col_dual_view_type X_col =
4389  Kokkos::subview (view_, Kokkos::ALL (), col);
4390  return Kokkos::Compat::persistingView (X_col.d_view);
4391  }
4392 
4393  template <class Scalar,
4394  class LocalOrdinal,
4395  class GlobalOrdinal,
4396  class Node,
4397  const bool classic>
4400  getDualView () const {
4401  return view_;
4402  }
4403 
4404  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4405  std::string
4407  descriptionImpl (const std::string& className) const
4408  {
4409  using Teuchos::TypeNameTraits;
4410 
4411  std::ostringstream out;
4412  out << "\"" << className << "\": {";
4413  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4414  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4415  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4416  << ", Node" << Node::name ()
4417  << "}, ";
4418  if (this->getObjectLabel () != "") {
4419  out << "Label: \"" << this->getObjectLabel () << "\", ";
4420  }
4421  out << ", numRows: " << this->getGlobalLength ();
4422  if (className != "Tpetra::Vector") {
4423  out << ", numCols: " << this->getNumVectors ()
4424  << ", isConstantStride: " << this->isConstantStride ();
4425  }
4426  if (this->isConstantStride ()) {
4427  out << ", columnStride: " << this->getStride ();
4428  }
4429  out << "}";
4430 
4431  return out.str ();
4432  }
4433 
4434  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4435  std::string
4438  {
4439  return this->descriptionImpl ("Tpetra::MultiVector");
4440  }
4441 
4442  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4443  std::string
4445  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4446  {
4447  typedef LocalOrdinal LO;
4448  using std::endl;
4449 
4450  if (vl <= Teuchos::VERB_LOW) {
4451  return std::string ();
4452  }
4453  auto map = this->getMap ();
4454  if (map.is_null ()) {
4455  return std::string ();
4456  }
4457  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4458  auto outp = Teuchos::getFancyOStream (outStringP);
4459  Teuchos::FancyOStream& out = *outp;
4460  auto comm = map->getComm ();
4461  const int myRank = comm->getRank ();
4462  const int numProcs = comm->getSize ();
4463 
4464  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4465  Teuchos::OSTab tab1 (out);
4466 
4467  // VERB_MEDIUM and higher prints getLocalLength()
4468  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4469  out << "Local number of rows: " << lclNumRows << endl;
4470 
4471  if (vl > Teuchos::VERB_MEDIUM) {
4472  // VERB_HIGH and higher prints isConstantStride() and getStride().
4473  // The first is only relevant if the Vector has multiple columns.
4474  if (this->getNumVectors () != static_cast<size_t> (1)) {
4475  out << "isConstantStride: " << this->isConstantStride () << endl;
4476  }
4477  if (this->isConstantStride ()) {
4478  out << "Column stride: " << this->getStride () << endl;
4479  }
4480 
4481  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4482  // VERB_EXTREME prints values. Get a host View of the
4483  // Vector's local data, so we can print it. (Don't assume
4484  // that we can access device data directly in host code.)
4485  auto X_dual = this->getDualView ();
4486  typename dual_view_type::t_host X_host;
4487  if (X_dual.template need_sync<Kokkos::HostSpace> ()) {
4488  // Device memory has the latest version. Don't actually
4489  // sync to host; that changes the Vector's state, and may
4490  // change future computations (that use the data's current
4491  // place to decide where to compute). Instead, create a
4492  // temporary host copy and print that.
4493  auto X_dev = this->template getLocalView<device_type> ();
4494  auto X_host_copy = Kokkos::create_mirror_view (X_dev);
4495  Kokkos::deep_copy (X_host_copy, X_dev);
4496  X_host = X_host_copy;
4497  }
4498  else {
4499  // Either host and device are in sync, or host has the
4500  // latest version of the Vector's data. Thus, we can use
4501  // the host version directly.
4502  X_host = this->template getLocalView<Kokkos::HostSpace> ();
4503  }
4504  // The square braces [] and their contents are in Matlab
4505  // format, so users may copy and paste directly into Matlab.
4506  out << "Values: " << endl
4507  << "[";
4508  const LO numCols = static_cast<LO> (this->getNumVectors ());
4509  if (numCols == 1) {
4510  for (LO i = 0; i < lclNumRows; ++i) {
4511  out << X_host(i,0);
4512  if (i + 1 < lclNumRows) {
4513  out << "; ";
4514  }
4515  }
4516  }
4517  else {
4518  for (LO i = 0; i < lclNumRows; ++i) {
4519  for (LO j = 0; j < numCols; ++j) {
4520  out << X_host(i,j);
4521  if (j + 1 < numCols) {
4522  out << ", ";
4523  }
4524  }
4525  if (i + 1 < lclNumRows) {
4526  out << "; ";
4527  }
4528  }
4529  }
4530  out << "]" << endl;
4531  }
4532  }
4533 
4534  out.flush (); // make sure the ostringstream got everything
4535  return outStringP->str ();
4536  }
4537 
4538  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4539  void
4541  describeImpl (Teuchos::FancyOStream& out,
4542  const std::string& className,
4543  const Teuchos::EVerbosityLevel verbLevel) const
4544  {
4545  using Teuchos::TypeNameTraits;
4546  using Teuchos::VERB_DEFAULT;
4547  using Teuchos::VERB_NONE;
4548  using Teuchos::VERB_LOW;
4549  using std::endl;
4550  const Teuchos::EVerbosityLevel vl =
4551  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4552 
4553  if (vl == VERB_NONE) {
4554  return; // don't print anything
4555  }
4556  // If this Vector's Comm is null, then the Vector does not
4557  // participate in collective operations with the other processes.
4558  // In that case, it is not even legal to call this method. The
4559  // reasonable thing to do in that case is nothing.
4560  auto map = this->getMap ();
4561  if (map.is_null ()) {
4562  return;
4563  }
4564  auto comm = map->getComm ();
4565  if (comm.is_null ()) {
4566  return;
4567  }
4568 
4569  const int myRank = comm->getRank ();
4570 
4571  // Only Process 0 should touch the output stream, but this method
4572  // in general may need to do communication. Thus, we may need to
4573  // preserve the current tab level across multiple "if (myRank ==
4574  // 0) { ... }" inner scopes. This is why we sometimes create
4575  // OSTab instances by pointer, instead of by value. We only need
4576  // to create them by pointer if the tab level must persist through
4577  // multiple inner scopes.
4578  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4579 
4580  // VERB_LOW and higher prints the equivalent of description().
4581  if (myRank == 0) {
4582  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4583  out << "\"" << className << "\":" << endl;
4584  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4585  {
4586  out << "Template parameters:" << endl;
4587  Teuchos::OSTab tab2 (out);
4588  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4589  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4590  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4591  << "Node: " << Node::name () << endl;
4592  }
4593  if (this->getObjectLabel () != "") {
4594  out << "Label: \"" << this->getObjectLabel () << "\", ";
4595  }
4596  out << "Global number of rows: " << this->getGlobalLength () << endl;
4597  if (className != "Tpetra::Vector") {
4598  out << "Number of columns: " << this->getNumVectors () << endl;
4599  }
4600  // getStride() may differ on different processes, so it (and
4601  // isConstantStride()) properly belong to per-process data.
4602  }
4603 
4604  // This is collective over the Map's communicator.
4605  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4606  const std::string lclStr = this->localDescribeToString (vl);
4607  Tpetra::Details::gathervPrint (out, lclStr, *comm);
4608  }
4609  }
4610 
4611  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4612  void
4614  describe (Teuchos::FancyOStream &out,
4615  const Teuchos::EVerbosityLevel verbLevel) const
4616  {
4617  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4618  }
4619 
4620  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4621  void
4623  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4624  {
4625  replaceMap (newMap);
4626  }
4627 
4628  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4629  void
4632  {
4633  typedef LocalOrdinal LO;
4634  typedef device_type DT;
4635  typedef typename dual_view_type::host_mirror_space::device_type HMDT;
4636  const bool debug = false;
4637 
4638  TEUCHOS_TEST_FOR_EXCEPTION(
4639  this->getGlobalLength () != src.getGlobalLength () ||
4640  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4641  "Tpetra::deep_copy: Global dimensions of the two Tpetra::MultiVector "
4642  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4643  << "," << src.getNumVectors () << "], and *this has dimensions ["
4644  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4645  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4646  TEUCHOS_TEST_FOR_EXCEPTION(
4647  this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4648  "Tpetra::deep_copy: The local row counts of the two Tpetra::MultiVector "
4649  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4650  << " and *this has " << this->getLocalLength () << " row(s).");
4651 
4652  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4653  std::cout << "*** MultiVector::assign: ";
4654  }
4655 
4656  if (src.isConstantStride () && this->isConstantStride ()) {
4657  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4658  std::cout << "Both *this and src have constant stride" << std::endl;
4659  }
4660 
4661  // If we need sync to device, then host has the most recent version.
4662  const bool useHostVersion = src.template need_sync<device_type> ();
4663 
4664  if (useHostVersion) { // Host memory has the most recent version of src.
4665  this->template modify<HMDT> (); // We are about to modify dst on host.
4666  // Copy from src to dst on host.
4667  Details::localDeepCopyConstStride (this->template getLocalView<HMDT> (),
4668  src.template getLocalView<HMDT> ());
4669  this->template sync<DT> (); // Sync dst from host to device.
4670  }
4671  else { // Device memory has the most recent version of src.
4672  this->template modify<DT> (); // We are about to modify dst on device.
4673  // Copy from src to dst on device.
4674  Details::localDeepCopyConstStride (this->template getLocalView<DT> (),
4675  src.template getLocalView<DT> ());
4676  this->template sync<HMDT> (); // Sync dst from device to host.
4677  }
4678  }
4679  else {
4680  if (this->isConstantStride ()) {
4681  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4682  std::cout << "Only *this has constant stride";
4683  }
4684 
4685  const LO numWhichVecs = static_cast<LO> (src.whichVectors_.size ());
4686  const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
4687 
4688  // We can't sync src, since it is only an input argument.
4689  // Thus, we have to use the most recently modified version of
4690  // src, device or host.
4691  //
4692  // If we need sync to device, then host has the most recent version.
4693  const bool useHostVersion = src.template need_sync<device_type> ();
4694  if (useHostVersion) { // host version of src most recently modified
4695  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4696  std::cout << "; Copy from host version of src" << std::endl;
4697  }
4698  // Copy from the host version of src.
4699  //
4700  // whichVecs tells the kernel which vectors (columns) of src
4701  // to copy. Fill whichVecs on the host, and use it there.
4702  typedef Kokkos::View<LO*, HMDT> whichvecs_type;
4703  whichvecs_type srcWhichVecs (whichVecsLabel, numWhichVecs);
4704  for (LO i = 0; i < numWhichVecs; ++i) {
4705  srcWhichVecs(i) = static_cast<LO> (src.whichVectors_[i]);
4706  }
4707  // Copy from the selected vectors of src to dst, on the
4708  // host. The function ignores its dstWhichVecs argument in
4709  // this case.
4710  Details::localDeepCopy (this->template getLocalView<HMDT> (),
4711  src.template getLocalView<HMDT> (),
4712  true, false, srcWhichVecs, srcWhichVecs);
4713  // Sync dst back to the device, since we only copied on the host.
4714  this->template sync<DT> ();
4715  }
4716  else { // copy from the device version of src
4717  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4718  std::cout << "; Copy from device version of src" << std::endl;
4719  }
4720  // Copy from the device version of src.
4721  //
4722  // whichVecs tells the kernel which vectors (columns) of src
4723  // to copy. Fill whichVecs on the host, and sync to device.
4724  typedef Kokkos::DualView<LO*, DT> whichvecs_type;
4725  whichvecs_type srcWhichVecs (whichVecsLabel, numWhichVecs);
4726  srcWhichVecs.template modify<HMDT> ();
4727  for (LO i = 0; i < numWhichVecs; ++i) {
4728  srcWhichVecs.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
4729  }
4730  // Sync the host version of srcWhichVecs to the device.
4731  srcWhichVecs.template sync<DT> ();
4732 
4733  // Mark the device version of dst's DualView as modified.
4734  this->template modify<DT> ();
4735 
4736  // Copy from the selected vectors of src to dst, on the
4737  // device. The function ignores its dstWhichVecs argument
4738  // in this case.
4739  Details::localDeepCopy (this->template getLocalView<DT> (),
4740  src.template getLocalView<DT> (),
4741  true, false, srcWhichVecs.d_view,
4742  srcWhichVecs.d_view);
4743  // Sync *this' DualView to the host. This is cheaper than
4744  // repeating the above copy from src to *this on the host.
4745  this->template sync<HMDT> ();
4746  }
4747 
4748  }
4749  else { // dst is NOT constant stride
4750  if (src.isConstantStride ()) {
4751  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4752  std::cout << "Only src has constant stride" << std::endl;
4753  }
4754 
4755  // If we need sync to device, then host has the most recent version.
4756  const bool useHostVersion = src.template need_sync<device_type> ();
4757  if (useHostVersion) { // src most recently modified on host
4758  // Copy from the host version of src.
4759  //
4760  // whichVecs tells the kernel which vectors (columns) of src
4761  // to copy. Fill whichVecs on the host, and use it there.
4762  typedef Kokkos::View<LO*, HMDT> whichvecs_type;
4763  const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4764  whichvecs_type whichVecs ("MV::deep_copy::whichVecs", numWhichVecs);
4765  for (LO i = 0; i < numWhichVecs; ++i) {
4766  whichVecs(i) = static_cast<LO> (this->whichVectors_[i]);
4767  }
4768  // Copy from src to the selected vectors of dst, on the
4769  // host. The functor ignores its 4th arg in this case.
4770  Details::localDeepCopy (this->template getLocalView<HMDT> (),
4771  src.template getLocalView<HMDT> (),
4772  this->isConstantStride (),
4773  src.isConstantStride (),
4774  whichVecs, whichVecs);
4775  // Sync dst back to the device, since we only copied on the host.
4776  //
4777  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
4778  // don't actually belong to dst's view.
4779  this->template sync<DT> ();
4780  }
4781  else { // Copy from the device version of src.
4782  // whichVecs tells the kernel which vectors (columns) of dst
4783  // to copy. Fill whichVecs on the host, and sync to device.
4784  typedef Kokkos::DualView<LO*, DT> whichvecs_type;
4785  const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
4786  const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4787  whichvecs_type whichVecs (whichVecsLabel, numWhichVecs);
4788  whichVecs.template modify<HMDT> ();
4789  for (LO i = 0; i < numWhichVecs; ++i) {
4790  whichVecs.h_view(i) = this->whichVectors_[i];
4791  }
4792  // Sync the host version of whichVecs to the device.
4793  whichVecs.template sync<DT> ();
4794 
4795  // Copy src to the selected vectors of dst, on the device.
4796  Details::localDeepCopy (this->template getLocalView<DT> (),
4797  src.template getLocalView<DT> (),
4798  this->isConstantStride (),
4799  src.isConstantStride (),
4800  whichVecs.d_view, whichVecs.d_view);
4801  // We can't sync src and repeat the above copy on the
4802  // host, so sync dst back to the host.
4803  //
4804  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
4805  // don't actually belong to dst's view.
4806  this->template sync<HMDT> ();
4807  }
4808  }
4809  else { // neither src nor dst have constant stride
4810  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4811  std::cout << "Neither *this nor src has constant stride" << std::endl;
4812  }
4813 
4814  // If we need sync to device, then host has the most recent version.
4815  const bool useHostVersion = src.template need_sync<device_type> ();
4816  if (useHostVersion) { // copy from the host version of src
4817  const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4818  Kokkos::View<LO*, HMDT> whichVectorsDst ("dstWhichVecs", dstNumWhichVecs);
4819  for (LO i = 0; i < dstNumWhichVecs; ++i) {
4820  whichVectorsDst(i) = this->whichVectors_[i];
4821  }
4822 
4823  // Use the destination MultiVector's LocalOrdinal type here.
4824  const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
4825  Kokkos::View<LO*, HMDT> whichVectorsSrc ("srcWhichVecs", srcNumWhichVecs);
4826  for (LO i = 0; i < srcNumWhichVecs; ++i) {
4827  whichVectorsSrc(i) = src.whichVectors_[i];
4828  }
4829 
4830  // Copy from the selected vectors of src to the selected
4831  // vectors of dst, on the host.
4832  Details::localDeepCopy (this->template getLocalView<HMDT> (),
4833  src.template getLocalView<HMDT> (),
4834  this->isConstantStride (),
4835  src.isConstantStride (),
4836  whichVectorsDst, whichVectorsSrc);
4837 
4838  // We can't sync src and repeat the above copy on the
4839  // host, so sync dst back to the host.
4840  //
4841  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
4842  // don't actually belong to dst's view.
4843  this->template sync<HMDT> ();
4844  }
4845  else { // copy from the device version of src
4846  // whichVectorsDst tells the kernel which columns of dst
4847  // to copy. Fill it on the host, and sync to device.
4848  const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4849  Kokkos::DualView<LO*, DT> whichVecsDst ("MV::deep_copy::whichVecsDst",
4850  dstNumWhichVecs);
4851  whichVecsDst.template modify<HMDT> ();
4852  for (LO i = 0; i < dstNumWhichVecs; ++i) {
4853  whichVecsDst.h_view(i) = static_cast<LO> (this->whichVectors_[i]);
4854  }
4855  // Sync the host version of whichVecsDst to the device.
4856  whichVecsDst.template sync<DT> ();
4857 
4858  // whichVectorsSrc tells the kernel which vectors
4859  // (columns) of src to copy. Fill it on the host, and
4860  // sync to device. Use the destination MultiVector's
4861  // LocalOrdinal type here.
4862  const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
4863  Kokkos::DualView<LO*, DT> whichVecsSrc ("MV::deep_copy::whichVecsSrc",
4864  srcNumWhichVecs);
4865  whichVecsSrc.template modify<HMDT> ();
4866  for (LO i = 0; i < srcNumWhichVecs; ++i) {
4867  whichVecsSrc.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
4868  }
4869  // Sync the host version of whichVecsSrc to the device.
4870  whichVecsSrc.template sync<DT> ();
4871 
4872  // Copy from the selected vectors of src to the selected
4873  // vectors of dst, on the device.
4874  Details::localDeepCopy (this->template getLocalView<DT> (),
4875  src.template getLocalView<DT> (),
4876  this->isConstantStride (),
4877  src.isConstantStride (),
4878  whichVecsDst.d_view, whichVecsSrc.d_view);
4879  }
4880  }
4881  }
4882  }
4883  }
4884 
4885  template <class Scalar, class LO, class GO, class NT, const bool classic>
4886  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT, classic> >
4887  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4888  size_t numVectors)
4889  {
4891  return Teuchos::rcp (new MV (map, numVectors));
4892  }
4893 
4894  template <class ST, class LO, class GO, class NT, const bool classic>
4895  MultiVector<ST, LO, GO, NT, classic>
4897  {
4899  MV cpy (src.getMap (), src.getNumVectors (), false);
4900  cpy.assign (src);
4901  return cpy;
4902  }
4903 
4904 } // namespace Tpetra
4905 
4906 //
4907 // Explicit instantiation macro
4908 //
4909 // Must be expanded from within the Tpetra namespace!
4910 //
4911 
4912 #define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4913  template class MultiVector< SCALAR , LO , GO , NODE >; \
4914  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4915  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors);
4916 
4917 #endif // TPETRA_MULTIVECTOR_DEF_HPP
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t getStride() const
Stride between columns in the multivector.
device_type::execution_space execution_space
The Kokkos execution space.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
MultiVector< ST, LO, GO, NT, classic > createCopy(const MultiVector< ST, LO, GO, NT, classic > &src)
Return a deep copy of the given MultiVector.
MultiVector< ST, LO, GO, NT, classic > createCopy(const MultiVector< ST, LO, GO, NT, classic > &src)
Return a deep copy of the given MultiVector.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
One or more distributed dense vectors.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object&#39;s Map.
EWhichNormImpl
Input argument for localNormImpl() (which see).
dual_view_type view_
The Kokkos::DualView containing the MultiVector&#39;s data.
Node::device_type device_type
The Kokkos Device type.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator, in rank order.
size_t global_size_t
Global size_t object.
Insert new values that don&#39;t currently exist.
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
bool isConstantStride() const
Whether this multivector has constant stride between columns.
size_t getLocalLength() const
Local number of rows on the calling process.
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Replace old value with maximum of magnitudes of old and new values.
Abstract base class for objects that can be the source of an Import or Export operation.
Replace existing values with new values.
Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
KOKKOS_INLINE_FUNCTION void GEMM(const char transA[], const char transB[], const CoefficientType &alpha, const ViewType1 &A, const ViewType2 &B, const CoefficientType &beta, const ViewType3 &C)
Small dense matrix-matrix multiply: C := alpha*A*B + beta*C
Kokkos::Details::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, typename execution_space::execution_space > dual_view_type
Kokkos::DualView specialization used by this class.
Describes a parallel distribution of objects over processes.
size_t getNumVectors() const
Number of columns in the multivector.
Class that provides GEMM for a particular Kokkos Device.
A distributed dense vector.
Stand-alone utility functions and macros.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &src)
Copy the contents of src into *this (deep copy).
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
EWhichNorm
Input argument for normImpl() (which see).
bool isDistributed() const
Whether this is a globally distributed object.
dual_view_type origView_
The "original view" of the MultiVector&#39;s data.