Tpetra parallel linear algebra  Version of the Day
Tpetra_DistObject_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DISTOBJECT_DEF_HPP
43 #define TPETRA_DISTOBJECT_DEF_HPP
44 
52 
53 #include "Tpetra_Distributor.hpp"
54 
55 namespace Tpetra {
56 
// Constructor taking the Map that this object will use.
//
// Stores \c map in map_.  If HAVE_TPETRA_TRANSFER_TIMERS is defined,
// it also sets up one timer per transfer phase (doTransfer,
// copyAndPermute, packAndPrepare, doPostsAndWaits, unpackAndCombine):
// each timer is looked up by name via TimeMonitor::lookupCounter, is
// created with TimeMonitor::getNewCounter if the lookup returned
// null, and is then cached in the matching *Timer_ member.
//
// NOTE(review): the class-qualifier line (listing line 58) is not
// present in this listing.
57  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
59  DistObject (const Teuchos::RCP<const map_type>& map) :
60  map_ (map)
61  {
62 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
63  using Teuchos::RCP;
64  using Teuchos::Time;
65  using Teuchos::TimeMonitor;
66 
 // Timer for the whole transfer.
67  RCP<Time> doXferTimer =
68  TimeMonitor::lookupCounter ("Tpetra::DistObject::doTransfer");
69  if (doXferTimer.is_null ()) {
70  doXferTimer =
71  TimeMonitor::getNewCounter ("Tpetra::DistObject::doTransfer");
72  }
73  doXferTimer_ = doXferTimer;
74 
 // Timer for the copyAndPermute phase.
75  RCP<Time> copyAndPermuteTimer =
76  TimeMonitor::lookupCounter ("Tpetra::DistObject::copyAndPermute");
77  if (copyAndPermuteTimer.is_null ()) {
78  copyAndPermuteTimer =
79  TimeMonitor::getNewCounter ("Tpetra::DistObject::copyAndPermute");
80  }
81  copyAndPermuteTimer_ = copyAndPermuteTimer;
82 
 // Timer for the packAndPrepare phase.
83  RCP<Time> packAndPrepareTimer =
84  TimeMonitor::lookupCounter ("Tpetra::DistObject::packAndPrepare");
85  if (packAndPrepareTimer.is_null ()) {
86  packAndPrepareTimer =
87  TimeMonitor::getNewCounter ("Tpetra::DistObject::packAndPrepare");
88  }
89  packAndPrepareTimer_ = packAndPrepareTimer;
90 
 // Timer for the communication (doPostsAndWaits) phase.
91  RCP<Time> doPostsAndWaitsTimer =
92  TimeMonitor::lookupCounter ("Tpetra::DistObject::doPostsAndWaits");
93  if (doPostsAndWaitsTimer.is_null ()) {
94  doPostsAndWaitsTimer =
95  TimeMonitor::getNewCounter ("Tpetra::DistObject::doPostsAndWaits");
96  }
97  doPostsAndWaitsTimer_ = doPostsAndWaitsTimer;
98 
 // Timer for the unpackAndCombine phase.
99  RCP<Time> unpackAndCombineTimer =
100  TimeMonitor::lookupCounter ("Tpetra::DistObject::unpackAndCombine");
101  if (unpackAndCombineTimer.is_null ()) {
102  unpackAndCombineTimer =
103  TimeMonitor::getNewCounter ("Tpetra::DistObject::unpackAndCombine");
104  }
105  unpackAndCombineTimer_ = unpackAndCombineTimer;
106 #endif // HAVE_TPETRA_TRANSFER_TIMERS
107  }
108 
// NOTE(review): the signature lines (listing lines 110-111) are not
// present in this listing; the initializer list suggests this is the
// copy constructor -- confirm against the class declaration.
//
// The only member initialized here is map_, which is copied from
// rhs.map_.  The body is empty.
109  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
112  : map_ (rhs.map_)
113  {}
114 
// NOTE(review): the signature lines (listing lines 116-117) are not
// present in this listing; given its position after the
// constructors, this is presumably the destructor -- confirm against
// the class declaration.  The body is empty.
115  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
118  {}
119 
// Return a one-line, human-readable description of this object.
//
// The result has the form
//   "Tpetra::DistObject": {Packet: ..., LocalOrdinal: ...,
//    GlobalOrdinal: ..., Node: ...[, Label: "..."]}
// where the type names come from Teuchos::TypeNameTraits.  The Label
// field is only emitted when getObjectLabel() is nonempty.
//
// NOTE(review): the class-qualifier line (listing line 122) is not
// present in this listing.
120  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
121  std::string
123  description () const
124  {
125  using Teuchos::TypeNameTraits;
126 
127  std::ostringstream os;
128  os << "\"Tpetra::DistObject\": {"
129  << "Packet: " << TypeNameTraits<packet_type>::name ()
130  << ", LocalOrdinal: " << TypeNameTraits<local_ordinal_type>::name ()
131  << ", GlobalOrdinal: " << TypeNameTraits<global_ordinal_type>::name ()
132  << ", Node: " << TypeNameTraits<Node>::name ();
133  if (this->getObjectLabel () != "") {
 // Every field after the first is introduced by ", ".  The Label
 // field must be too; otherwise it runs directly into the Node
 // type name.
134  os << ", Label: \"" << this->getObjectLabel () << "\"";
135  }
136  os << "}";
137  return os.str ();
138  }
139 
// Print a description of this object to the given output stream.
//
// \param out [out] Stream to which to write.
// \param verbLevel [in] Verbosity level.  VERB_DEFAULT is treated as
//   VERB_LOW.  VERB_NONE prints nothing.
//
// What gets printed:
//   - The process with rank 0 prints the class name, the template
//     parameters' type names, and (if nonempty) the object label.
//   - Every process calls map_->describe (out, vl).
//   - If the verbosity is greater than VERB_LOW, each process in
//     turn (in rank order) prints the sizes of its export and
//     import buffers.
//
// When the verbosity is greater than VERB_LOW and the communicator
// is nonnull, this method calls comm->barrier() inside the loop over
// ranks, so in that case every process in the communicator needs to
// call it.
//
// If the Map's communicator is null, this method behaves as if
// there were one process with rank 0.
//
// NOTE(review): the class-qualifier line (listing line 142) is not
// present in this listing.
140  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
141  void
143  describe (Teuchos::FancyOStream &out,
144  const Teuchos::EVerbosityLevel verbLevel) const
145  {
146  using Teuchos::rcpFromRef;
147  using Teuchos::TypeNameTraits;
148  using std::endl;
 // Map the default verbosity level to VERB_LOW.
149  const Teuchos::EVerbosityLevel vl = (verbLevel == Teuchos::VERB_DEFAULT) ?
150  Teuchos::VERB_LOW : verbLevel;
151  Teuchos::RCP<const Teuchos::Comm<int> > comm = this->getMap ()->getComm ();
 // A null communicator is treated like a single process of rank 0.
152  const int myRank = comm.is_null () ? 0 : comm->getRank ();
153  const int numProcs = comm.is_null () ? 1 : comm->getSize ();
154 
155  if (vl != Teuchos::VERB_NONE) {
156  Teuchos::OSTab tab0 (out);
157  if (myRank == 0) {
158  out << "\"Tpetra::DistObject\":" << endl;
159  }
160  Teuchos::OSTab tab1 (out);
161  if (myRank == 0) {
162  out << "Template parameters:" << endl;
163  {
164  Teuchos::OSTab tab2 (out);
165  out << "Packet: " << TypeNameTraits<packet_type>::name () << endl
166  << "LocalOrdinal: " << TypeNameTraits<local_ordinal_type>::name () << endl
167  << "GlobalOrdinal: " << TypeNameTraits<global_ordinal_type>::name () << endl
168  << "Node: " << TypeNameTraits<node_type>::name () << endl;
169  }
170  if (this->getObjectLabel () != "") {
171  out << "Label: \"" << this->getObjectLabel () << "\"" << endl;
172  }
173  } // if myRank == 0
174 
175  // Describe the Map.
176  {
177  if (myRank == 0) {
178  out << "Map:" << endl;
179  }
180  Teuchos::OSTab tab2 (out);
181  map_->describe (out, vl);
182  }
183 
184  // At verbosity > VERB_LOW, each process prints something.
185  if (vl > Teuchos::VERB_LOW) {
 // Loop over ranks so that processes print one at a time, in rank
 // order; the barriers below separate consecutive ranks' output.
186  for (int p = 0; p < numProcs; ++p) {
187  if (myRank == p) {
188  out << "Process " << myRank << ":" << endl;
189  Teuchos::OSTab tab2 (out);
190  out << "Export buffer size (in packets): "
191  << exports_.dimension_0 ()
192  << endl
193  << "Import buffer size (in packets): "
194  << imports_.dimension_0 ()
195  << endl;
196  }
197  if (! comm.is_null ()) {
198  comm->barrier (); // give output time to finish
199  comm->barrier ();
200  comm->barrier ();
201  }
202  } // for each process rank p
203  } // if vl > VERB_LOW
204  } // if vl != VERB_NONE
205  }
206 
// Default implementation of removeEmptyProcessesInPlace.
//
// This implementation does not use \c newMap.  It unconditionally
// throws std::logic_error with a "Not implemented" message (the test
// condition passed to TEUCHOS_TEST_FOR_EXCEPTION is the literal
// \c true).
//
// NOTE(review): the class-qualifier line (listing line 209) is not
// present in this listing.
207  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
208  void
210  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
211  {
212  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
213  "Tpetra::DistObject::removeEmptyProcessesInPlace: Not implemented");
214  }
215 
216  /* These are provided in base DistObject template
217  template<class DistObjectType>
218  void
219  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input,
220  const Teuchos::RCP<const Map<typename DistObjectType::local_ordinal_type,
221  typename DistObjectType::global_ordinal_type,
222  typename DistObjectType::node_type> >& newMap)
223  {
224  input->removeEmptyProcessesInPlace (newMap);
225  if (newMap.is_null ()) { // my process is excluded
226  input = Teuchos::null;
227  }
228  }
229 
230  template<class DistObjectType>
231  void
232  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input)
233  {
234  using Teuchos::RCP;
235  typedef typename DistObjectType::local_ordinal_type LO;
236  typedef typename DistObjectType::global_ordinal_type GO;
237  typedef typename DistObjectType::node_type NT;
238  typedef Map<LO, GO, NT> map_type;
239 
240  RCP<const map_type> newMap = input->getMap ()->removeEmptyProcesses ();
241  removeEmptyProcessesInPlace<DistObjectType> (input, newMap);
242  }
243  */
244 
// Forward-mode Import: redistribute data from \c source into this
// object, using the communication pattern stored in \c importer.
//
// \param source [in] Object from which to take data.
// \param CM [in] Combine mode, passed through to doTransfer.
//
// When HAVE_TPETRA_DEBUG is defined, this method throws
// std::invalid_argument if this object's Map is not equal to the
// Import's target Map, or if \c source can be cast to this_type and
// its Map is not equal to the Import's source Map.
//
// The method then reads the number of "same" IDs and the export,
// remote, permute-to, and permute-from LID lists from \c importer,
// and hands them, together with the Import's Distributor, to
// doTransfer with the DoForward option.
//
// NOTE(review): the class-qualifier line and the declaration of the
// \c importer parameter (listing lines 247 and 249) are not present
// in this listing.
245  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
246  void
248  doImport (const SrcDistObject& source,
250  CombineMode CM)
251  {
252 #ifdef HAVE_TPETRA_DEBUG
253  TEUCHOS_TEST_FOR_EXCEPTION(*getMap() != *importer.getTargetMap(),
254  std::invalid_argument, "doImport: The target DistObject's Map is not "
255  "identical to the Import's target Map.");
256  {
 // The source Map can only be checked if the source is itself of
 // this DistObject type; otherwise the cast yields NULL and the
 // check is skipped.
257  const this_type* srcDistObj = dynamic_cast<const this_type*> (&source);
258  TEUCHOS_TEST_FOR_EXCEPTION(
259  srcDistObj != NULL && * (srcDistObj->getMap ()) != *importer.getSourceMap(),
260  std::invalid_argument, "doImport: The source is a DistObject, yet its "
261  "Map is not identical to the Import's source Map.");
262  }
263 #endif // HAVE_TPETRA_DEBUG
264  size_t numSameIDs = importer.getNumSameIDs ();
265 
266  typedef Teuchos::ArrayView<const LocalOrdinal> view_type;
267  const view_type exportLIDs = importer.getExportLIDs();
268  const view_type remoteLIDs = importer.getRemoteLIDs();
269  const view_type permuteToLIDs = importer.getPermuteToLIDs();
270  const view_type permuteFromLIDs = importer.getPermuteFromLIDs();
271  this->doTransfer (source, CM, numSameIDs, permuteToLIDs, permuteFromLIDs,
272  remoteLIDs, exportLIDs, importer.getDistributor (),
273  DoForward);
274  }
275 
// Forward-mode Export: redistribute data from \c source into this
// object, using the communication pattern stored in \c exporter.
//
// \param source [in] Object from which to take data.
// \param CM [in] Combine mode, passed through to doTransfer.
//
// When HAVE_TPETRA_DEBUG is defined, this method throws
// std::invalid_argument if this object's Map is not equal to the
// Export's target Map, or if \c source can be cast to this_type and
// its Map is not equal to the Export's source Map.
//
// The method then reads the number of "same" IDs and the export,
// remote, permute-to, and permute-from LID lists from \c exporter,
// and hands them, together with the Export's Distributor, to
// doTransfer with the DoForward option.
//
// NOTE(review): the class-qualifier line and the declaration of the
// \c exporter parameter (listing lines 278 and 280) are not present
// in this listing.
276  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
277  void
279  doExport (const SrcDistObject& source,
281  CombineMode CM)
282  {
283 #ifdef HAVE_TPETRA_DEBUG
284  TEUCHOS_TEST_FOR_EXCEPTION(
285  *getMap() != *exporter.getTargetMap(), std::invalid_argument,
286  "doExport: The target DistObject's Map is not identical to the Export's "
287  "target Map.");
288  {
 // The source Map can only be checked if the source is itself of
 // this DistObject type; otherwise the cast yields NULL and the
 // check is skipped.
289  const this_type* srcDistObj = dynamic_cast<const this_type*> (&source);
290  TEUCHOS_TEST_FOR_EXCEPTION(
291  srcDistObj != NULL && * (srcDistObj->getMap ()) != *exporter.getSourceMap(),
292  std::invalid_argument, "doExport: The source is a DistObject, yet its "
293  "Map is not identical to the Export's source Map.");
294  }
295 #endif // HAVE_TPETRA_DEBUG
296  size_t numSameIDs = exporter.getNumSameIDs();
297 
298  typedef Teuchos::ArrayView<const LocalOrdinal> view_type;
299  view_type exportLIDs = exporter.getExportLIDs();
300  view_type remoteLIDs = exporter.getRemoteLIDs();
301  view_type permuteToLIDs = exporter.getPermuteToLIDs();
302  view_type permuteFromLIDs = exporter.getPermuteFromLIDs();
303  doTransfer (source, CM, numSameIDs, permuteToLIDs, permuteFromLIDs, remoteLIDs,
304  exportLIDs, exporter.getDistributor (), DoForward);
305  }
306 
// Reverse-mode Import: redistribute data from \c source into this
// object by running the communication pattern of an Export
// (\c exporter) backwards.
//
// \param source [in] Object from which to take data.
// \param CM [in] Combine mode, passed through to doTransfer.
//
// When HAVE_TPETRA_DEBUG is defined, this method throws
// std::invalid_argument if this object's Map is not equal to the
// Export's *source* Map, or if \c source can be cast to this_type
// and its Map is not equal to the Export's *target* Map.
//
// Because the pattern is reversed, the roles of the LID lists are
// swapped relative to the forward-mode methods: the Export's remote
// LIDs are used as export LIDs and vice versa, and likewise for the
// permute-to and permute-from LIDs.  doTransfer is called with the
// DoReverse option.
//
// NOTE(review): the class-qualifier line and the declaration of the
// \c exporter parameter (listing lines 309 and 311) are not present
// in this listing.
307  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
308  void
310  doImport (const SrcDistObject& source,
312  CombineMode CM)
313  {
314 #ifdef HAVE_TPETRA_DEBUG
315  TEUCHOS_TEST_FOR_EXCEPTION(
316  *getMap() != *exporter.getSourceMap(), std::invalid_argument,
317  "doImport (reverse mode): The target DistObject's Map is not identical "
318  "to the Export's source Map.");
319  {
320  const this_type* srcDistObj = dynamic_cast<const this_type*> (&source);
321  TEUCHOS_TEST_FOR_EXCEPTION(
322  srcDistObj != NULL && * (srcDistObj->getMap ()) != *exporter.getTargetMap(),
323  std::invalid_argument,
324  "doImport (reverse mode): The source is a DistObject, yet its "
325  "Map is not identical to the Export's target Map.");
326  }
327 #endif // HAVE_TPETRA_DEBUG
328  size_t numSameIDs = exporter.getNumSameIDs();
329 
330  typedef Teuchos::ArrayView<const LocalOrdinal> view_type;
 // Reverse mode: swap export <-> remote and permuteTo <-> permuteFrom.
331  view_type exportLIDs = exporter.getRemoteLIDs();
332  view_type remoteLIDs = exporter.getExportLIDs();
333  view_type permuteToLIDs = exporter.getPermuteFromLIDs();
334  view_type permuteFromLIDs = exporter.getPermuteToLIDs();
335  doTransfer (source, CM, numSameIDs, permuteToLIDs, permuteFromLIDs, remoteLIDs,
336  exportLIDs, exporter.getDistributor (), DoReverse);
337  }
338 
// Reverse-mode Export: redistribute data from \c source into this
// object by running the communication pattern of an Import
// (\c importer) backwards.
//
// \param source [in] Object from which to take data.
// \param CM [in] Combine mode, passed through to doTransfer.
//
// When HAVE_TPETRA_DEBUG is defined, this method throws
// std::invalid_argument if this object's Map is not equal to the
// Import's *source* Map, or if \c source can be cast to this_type
// and its Map is not equal to the Import's *target* Map.
//
// Because the pattern is reversed, the roles of the LID lists are
// swapped relative to the forward-mode methods: the Import's remote
// LIDs are used as export LIDs and vice versa, and likewise for the
// permute-to and permute-from LIDs.  doTransfer is called with the
// DoReverse option.
//
// NOTE(review): the class-qualifier line and the declaration of the
// \c importer parameter (listing lines 341 and 343) are not present
// in this listing.
339  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
340  void
342  doExport (const SrcDistObject& source,
344  CombineMode CM)
345  {
346 #ifdef HAVE_TPETRA_DEBUG
347  TEUCHOS_TEST_FOR_EXCEPTION(
348  *getMap() != *importer.getSourceMap(), std::invalid_argument,
349  "doExport (reverse mode): The target object's Map "
350  "is not identical to the Import's source Map.");
351  {
352  const this_type* srcDistObj = dynamic_cast<const this_type*> (&source);
353  TEUCHOS_TEST_FOR_EXCEPTION(
354  srcDistObj != NULL && * (srcDistObj->getMap ()) != *importer.getTargetMap(),
355  std::invalid_argument,
356  "doExport (reverse mode): The source is a DistObject, yet its "
357  "Map is not identical to the Import's target Map.");
358  }
359 #endif // HAVE_TPETRA_DEBUG
360  size_t numSameIDs = importer.getNumSameIDs();
361 
362  typedef Teuchos::ArrayView<const LocalOrdinal> view_type;
 // Reverse mode: swap export <-> remote and permuteTo <-> permuteFrom.
363  view_type exportLIDs = importer.getRemoteLIDs();
364  view_type remoteLIDs = importer.getExportLIDs();
365  view_type permuteToLIDs = importer.getPermuteFromLIDs();
366  view_type permuteFromLIDs = importer.getPermuteToLIDs();
367  doTransfer (source, CM, numSameIDs, permuteToLIDs, permuteFromLIDs, remoteLIDs,
368  exportLIDs, importer.getDistributor (), DoReverse);
369  }
370 
// Whether this object is distributed.
//
// This simply forwards to the isDistributed() method of this
// object's Map (map_), which is dereferenced without a null check.
//
// NOTE(review): the class-qualifier line (listing line 373) is not
// present in this listing.
371  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
372  bool
374  isDistributed () const {
375  return map_->isDistributed ();
376  }
377 
// NOTE(review): the lines carrying the class qualifier and the
// method name (listing lines 380-381) are not present in this
// listing.  Judging by the size_t return type and by the call
// "this->constantNumberOfPackets ()" in doTransferOld, this is
// presumably constantNumberOfPackets() -- confirm against the class
// declaration.
//
// The default implementation returns 0.  doTransferOld treats a zero
// result as "the number of packets per LID is not known to be
// constant" and allocates the per-LID packet-count arrays in that
// case.
378  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
379  size_t
382  return 0; // default implementation; subclasses may override
383  }
384 
// NOTE(review): the lines carrying the class qualifier, the method
// name, and the first parameter (listing lines 387-388) are not
// present in this listing.  Judging by the parameter list and by the
// calls in doImport / doExport, this is presumably
// doTransfer (const SrcDistObject& src, ...) -- confirm against the
// class declaration.
//
// Dispatch a transfer to either the "new" (Kokkos::DualView-based)
// or the "old" (Teuchos::ArrayView-based) implementation.
//
// \param CM [in] Combine mode.
// \param numSameIDs [in] Number of "same" IDs, as obtained from the
//   Import or Export object by the callers in this file.
// \param permuteToLIDs_ [in] Permute-to LIDs.
// \param permuteFromLIDs_ [in] Permute-from LIDs.
// \param remoteLIDs_ [in] Remote LIDs.
// \param exportLIDs_ [in] Export LIDs.
// \param distor [in/out] Distributor, passed through to the chosen
//   implementation.
// \param revOp [in] Forward or reverse mode.
//
// If useNewInterface() returns true, each of the four LID lists is
// copied into a Kokkos::DualView by getDualViewCopyFromArrayView,
// and doTransferNew is called with commOnHost = false.  Otherwise,
// the arguments are passed unchanged to doTransferOld.
385  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
386  void
389  CombineMode CM,
390  size_t numSameIDs,
391  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs_,
392  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs_,
393  const Teuchos::ArrayView<const LocalOrdinal>& remoteLIDs_,
394  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs_,
395  Distributor& distor,
396  ReverseOption revOp)
397  {
399  typedef LocalOrdinal LO;
400  typedef device_type DT;
401 
402  if (this->useNewInterface ()) {
 // Hard-coded here; it is passed both to the DualView conversion
 // helper and to doTransferNew.
403  const bool commOnHost = false;
404 
405  // Convert arguments to Kokkos::DualView. This currently
406  // involves deep copy, either to host or to device (depending on
407  // commOnHost). At some point, we need to change the interface
408  // of doTransfer so it takes DualView (or just View) rather than
409  // Teuchos::ArrayView, so that we won't need this deep copy.
410  //
411  // We don't need to sync the arguments. commOnHost determines
412  // where the most recent version lives.
413  Kokkos::DualView<LO*, DT> permuteToLIDs =
414  getDualViewCopyFromArrayView<LO, DT> (permuteToLIDs_,
415  "permuteToLIDs",
416  commOnHost);
417  Kokkos::DualView<LO*, DT> permuteFromLIDs =
418  getDualViewCopyFromArrayView<LO, DT> (permuteFromLIDs_,
419  "permuteFromLIDs",
420  commOnHost);
421  // No need to sync this. packAndPrepareNew will use it to
422  // determine where to pack (in host or device memory).
423  Kokkos::DualView<LO*, DT> remoteLIDs =
424  getDualViewCopyFromArrayView<LO, DT> (remoteLIDs_,
425  "remoteLIDs",
426  commOnHost);
427  Kokkos::DualView<LO*, DT> exportLIDs =
428  getDualViewCopyFromArrayView<LO, DT> (exportLIDs_,
429  "exportLIDs",
430  commOnHost);
431 
432  doTransferNew (src, CM, numSameIDs, permuteToLIDs, permuteFromLIDs,
433  remoteLIDs, exportLIDs, distor, revOp, commOnHost);
434  }
435  else {
 // Legacy path: pass the ArrayView arguments through unchanged.
436  doTransferOld (src, CM, numSameIDs, permuteToLIDs_, permuteFromLIDs_,
437  remoteLIDs_, exportLIDs_, distor, revOp);
438  }
439  }
440 
// Make imports_ have exactly \c newSize entries.
//
// \param newSize [in] Required value of imports_.dimension_0().
// \param debug [in] If true, print a message to std::cerr whenever a
//   reallocation happens.
//
// If imports_.dimension_0() already equals \c newSize, this method
// does nothing.  Otherwise (whether the current size is larger or
// smaller), it replaces imports_ with a newly constructed object of
// the same type, labeled "imports", with \c newSize entries.
// Nothing in this method copies the old contents into the new
// buffer.  The replacement is bracketed by execution_space::fence()
// calls.
//
// Throws std::logic_error if, after the replacement,
// imports_.dimension_0() still differs from \c newSize.
//
// NOTE(review): the class-qualifier line (listing line 443) is not
// present in this listing.
441  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
442  void
444  reallocImportsIfNeeded (const size_t newSize, const bool debug)
445  {
446  if (static_cast<size_t> (imports_.dimension_0 ()) != newSize) {
447  if (debug) {
 // Build the whole message first, then emit it with a single write
 // to std::cerr.
448  std::ostringstream os;
449  os << "*** Realloc imports_ from " << imports_.dimension_0 () << " to "
450  << newSize << std::endl;
451  std::cerr << os.str ();
452  }
453  // FIXME (mfh 28 Mar 2016, 25 Apr 2016) Fences around (UVM)
454  // allocations are for #227 debugging, but shouldn't be needed
455  // once #227 is fixed.
456  execution_space::fence ();
457  imports_ = decltype (imports_) ("imports", newSize);
458  execution_space::fence ();
459  TEUCHOS_TEST_FOR_EXCEPTION
460  (static_cast<size_t> (imports_.dimension_0 ()) != newSize,
461  std::logic_error, "DualView reallocation failed: "
462  "imports_.dimension_0() = " << imports_.dimension_0 ()
463  << " != " << newSize << ".");
464  }
465  }
466 
// Legacy (Teuchos::ArrayView-based) implementation of the data
// redistribution that underlies doImport and doExport.
//
// \param src [in] Object from which to take data.
// \param CM [in] Combine mode.  If it is ZERO, no packing,
//   communication, or unpacking takes place.
// \param numSameIDs [in] Number of "same" IDs.
// \param permuteToLIDs [in] Permute-to LIDs.
// \param permuteFromLIDs [in] Permute-from LIDs.
// \param remoteLIDs [in] LIDs into which to receive.
// \param exportLIDs [in] LIDs to send.
// \param distor [in/out] Distributor that performs the communication.
// \param revOp [in] DoForward or DoReverse.
//
// Steps, in order:
//   1. Throw std::invalid_argument if checkSizes (src) returns false.
//   2. Pick the view access mode: WriteOnly if CM is INSERT or
//      REPLACE and numSameIDs + permuteToLIDs.size () +
//      remoteLIDs.size () equals the Map's getNodeNumElements ();
//      ReadWrite otherwise.
//   3. If \c src can be cast to this_type, call createViews () on
//      it.  Call createViewsNonConst () on this object.
//   4. If numSameIDs + permuteToLIDs.size () is nonzero, call
//      copyAndPermute.
//   5. If CM != ZERO: if constantNumberOfPackets () returned zero,
//      (re)allocate the per-LID packet-count arrays; then call
//      packAndPrepare into a temporary Teuchos::Array and deep-copy
//      the result into the host view of exports_ (reallocating
//      exports_ first if its size differs).
//   6. If \c src can be cast to this_type, call releaseViews () on it.
//   7. If CM != ZERO: size imports_ (when the packet count per LID is
//      constant), decide whether communication is needed, and if so
//      run the Distributor's (reverse) posts-and-waits followed by
//      unpackAndCombine.  With a variable packet count per LID, the
//      per-LID counts are communicated first and imports_ is sized
//      from their sum.
//   8. Call releaseViews () on this object.
//
// If HAVE_TPETRA_TRANSFER_TIMERS is defined, the phases are timed
// with the timers created in the constructor.
//
// NOTE(review): listing lines 469 (class qualifier) and 480 are not
// present in this listing.
467  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
468  void
470  doTransferOld (const SrcDistObject& src,
471  CombineMode CM,
472  size_t numSameIDs,
473  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
474  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
475  const Teuchos::ArrayView<const LocalOrdinal>& remoteLIDs,
476  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
477  Distributor &distor,
478  ReverseOption revOp)
479  {
 // Compile-time switch for the diagnostic output to std::cerr below.
481  const bool debug = false;
482 
483 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
484  Teuchos::TimeMonitor doXferMon (*doXferTimer_);
485 #endif // HAVE_TPETRA_TRANSFER_TIMERS
486 
487  TEUCHOS_TEST_FOR_EXCEPTION(
488  ! checkSizes (src), std::invalid_argument,
489  "Tpetra::DistObject::doTransfer(): checkSizes() indicates that the "
490  "destination object is not a legal target for redistribution from the "
491  "source object. This probably means that they do not have the same "
492  "dimensions. For example, MultiVectors must have the same number of "
493  "rows and columns.");
494  KokkosClassic::ReadWriteOption rwo = KokkosClassic::ReadWrite;
495  if (CM == INSERT || CM == REPLACE) {
496  const size_t numIDsToWrite = numSameIDs +
497  static_cast<size_t> (permuteToLIDs.size ()) +
498  static_cast<size_t> (remoteLIDs.size ());
499  if (numIDsToWrite == this->getMap ()->getNodeNumElements ()) {
500  // We're overwriting all of our local data in the destination
501  // object, so a write-only view suffices.
502  //
503  // FIXME (mfh 10 Apr 2012) This doesn't make sense for a
504  // CrsMatrix with a dynamic graph. INSERT mode could mean
505  // that we're adding new entries to the object, but we don't
506  // want to get rid of the old ones.
507  rwo = KokkosClassic::WriteOnly;
508  }
509  }
510  // Tell the source to create a read-only view of its data. On a
511  // discrete accelerator such as a GPU, this brings EVERYTHING from
512  // device memory to host memory.
513  //
514  // FIXME (mfh 23 Mar 2012) By passing in the list of GIDs (or
515  // rather, local LIDs to send) and packet counts, createViews()
516  // could create a "sparse view" that only brings in the necessary
517  // data from device to host memory.
518  const this_type* srcDistObj = dynamic_cast<const this_type*> (&src);
519  if (srcDistObj != NULL) {
520  srcDistObj->createViews ();
521  }
522 
523  // Tell the target to create a view of its data. Depending on
524  // rwo, this could be a write-only view or a read-and-write view.
525  // On a discrete accelerator such as a GPU, a write-only view only
526  // requires a transfer from host to device memory. A
527  // read-and-write view requires a two-way transfer. This has the
528  // same problem as createViews(): it transfers EVERYTHING, not
529  // just the necessary data.
530  //
531  // FIXME (mfh 23 Mar 2012) By passing in the list of GIDs (or
532  // rather, local LIDs into which to receive) and packet counts,
533  // createViewsNonConst() could create a "sparse view" that only
534  // transfers the necessary data.
535  this->createViewsNonConst (rwo);
536 
537  if (numSameIDs + permuteToLIDs.size()) {
538 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
539  Teuchos::TimeMonitor copyAndPermuteMon (*copyAndPermuteTimer_);
540 #endif // HAVE_TPETRA_TRANSFER_TIMERS
541  // There is at least one GID to copy or permute.
542  copyAndPermute (src, numSameIDs, permuteToLIDs, permuteFromLIDs);
543  }
544 
545  // The method may return zero even if the implementation actually
546  // does have a constant number of packets per LID. However, if it
547  // returns nonzero, we may use this information to avoid
548  // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare()
549  // will set this to its final value.
550  //
551  // We only need this if CM != ZERO, but it has to be lifted out of
552  // that scope because there are multiple tests for CM != ZERO.
553  size_t constantNumPackets = this->constantNumberOfPackets ();
554 
555  // We only need to pack communication buffers if the combine mode
556  // is not ZERO. A "ZERO combine mode" means that the results are
557  // the same as if we had received all zeros, and added them to the
558  // existing values. That means we don't need to communicate.
559  if (CM != ZERO) {
560  if (constantNumPackets == 0) {
561  // FIXME (mfh 25 Apr 2016) Fences around (UVM) allocations
562  // facilitate #227 debugging, but we shouldn't need them.
563  execution_space::fence ();
564  numExportPacketsPerLID_ =
565  decltype (numExportPacketsPerLID_) ("numExportPacketsPerLID",
566  exportLIDs.size ());
567  execution_space::fence ();
568  numImportPacketsPerLID_ =
569  decltype (numImportPacketsPerLID_) ("numImportPacketsPerLID",
570  remoteLIDs.size ());
571  execution_space::fence ();
572  }
573 
574  {
575 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
576  Teuchos::TimeMonitor packAndPrepareMon (*packAndPrepareTimer_);
577 #endif // HAVE_TPETRA_TRANSFER_TIMERS
578  // Ask the source to pack data. Also ask it whether there are a
579  // constant number of packets per element (constantNumPackets is
580  // an output argument). If there are, constantNumPackets will
581  // come back nonzero. Otherwise, the source will fill the
582  // numExportPacketsPerLID_ array.
583  numExportPacketsPerLID_.template modify<Kokkos::HostSpace> ();
584  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
585  getArrayViewFromDualView (numExportPacketsPerLID_);
586 
587  // FIXME (mfh 26 Apr 2016) For backwards compatibility, use
588  // the old packAndPrepare interface that takes and resizes the
589  // exports buffer as a Teuchos::Array<packet_type>. Then,
590  // copy out that buffer into the host version of exports_.
591 
592  Teuchos::Array<packet_type> exportsOld;
593  packAndPrepare (src, exportLIDs, exportsOld, numExportPacketsPerLID,
594  constantNumPackets, distor);
595  const size_t exportsLen = static_cast<size_t> (exportsOld.size ());
596  if (static_cast<size_t> (exports_.dimension_0 ()) != exportsLen) {
597  // FIXME (mfh 26 Apr 2016) Fences around (UVM) allocations
598  // facilitate #227 debugging, but we shouldn't need them.
599  execution_space::fence ();
600  exports_ = decltype (exports_) ("exports", exportsLen);
601  execution_space::fence ();
602  }
 // Wrap the temporary array in an unmanaged host View so that it
 // can serve as the source of the deep copy into exports_.
603  Kokkos::View<const packet_type*, Kokkos::HostSpace,
604  Kokkos::MemoryUnmanaged> exportsOldK (exportsOld.getRawPtr (),
605  exportsLen);
606  exports_.template modify<Kokkos::HostSpace> ();
607  Kokkos::deep_copy (exports_.template view<Kokkos::HostSpace> (),
608  exportsOldK);
609  }
610  }
611 
612  // We don't need the source's data anymore, so it can let go of
613  // its views. On an accelerator device with a separate memory
614  // space (like a GPU), this frees host memory, since device memory
615  // has the "master" version of the data.
616  if (srcDistObj != NULL) {
617  srcDistObj->releaseViews ();
618  }
619 
620  // We only need to send data if the combine mode is not ZERO.
621  if (CM != ZERO) {
622  if (constantNumPackets != 0) {
623  // There are a constant number of packets per element. We
624  // already know (from the number of "remote" (incoming)
625  // elements) how many incoming elements we expect, so we can
626  // resize the buffer accordingly.
627  const size_t rbufLen = remoteLIDs.size() * constantNumPackets;
628  if (debug) {
629  std::ostringstream os;
630  os << "*** doTransferOld: Const # packets: imports_.dimension_0() = "
631  << imports_.dimension_0 () << ", rbufLen = " << rbufLen
632  << std::endl;
633  std::cerr << os.str ();
634  }
635  reallocImportsIfNeeded (rbufLen, debug);
636  }
637 
638  // Do we need to do communication (via doPostsAndWaits)?
639  bool needCommunication = true;
640  if (revOp == DoReverse && ! isDistributed ()) {
641  needCommunication = false;
642  }
643  // FIXME (mfh 30 Jun 2013): Checking whether the source object
644  // is distributed requires a cast to DistObject. If it's not a
645  // DistObject, then I'm not quite sure what to do. Perhaps it
646  // would be more appropriate for SrcDistObject to have an
647  // isDistributed() method. For now, I'll just assume that we
648  // need to do communication unless the cast succeeds and the
649  // source is not distributed.
650  else if (revOp == DoForward && srcDistObj != NULL &&
651  ! srcDistObj->isDistributed ()) {
652  needCommunication = false;
653  }
654 
 // Note that unpackAndCombine (below) is inside this branch too, so
 // it is skipped whenever communication is skipped.
655  if (needCommunication) {
656  if (revOp == DoReverse) {
657 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
658  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
659 #endif // HAVE_TPETRA_TRANSFER_TIMERS
660  if (constantNumPackets == 0) { //variable num-packets-per-LID:
661  // First communicate the number of packets per LID to receive.
662 
663  // Make sure that host has the latest version, since we're
664  // using the version on host. If host has the latest
665  // version already, syncing to host does nothing.
666  numExportPacketsPerLID_.template sync<Kokkos::HostSpace> ();
667  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
668  getArrayViewFromDualView (numExportPacketsPerLID_);
669 
670  // numImportPacketsPerLID_ is the output array here, so
671  // mark it as modified. It's strictly output, so we don't
672  // have to sync from device.
673  //numImportPacketsPerLID_.template sync<Kokkos::HostSpace> ();
674  numImportPacketsPerLID_.template modify<Kokkos::HostSpace> ();
675  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
676  getArrayViewFromDualView (numImportPacketsPerLID_);
677  distor.doReversePostsAndWaits (numExportPacketsPerLID, 1,
678  numImportPacketsPerLID);
 // The total number of incoming packets is the sum of the received
 // per-LID counts; it determines the size of imports_.
679  size_t totalImportPackets = 0;
680  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
681  totalImportPackets += numImportPacketsPerLID[i];
682  }
683 
684  reallocImportsIfNeeded (totalImportPackets, debug);
685 
686  // We don't need to sync imports_, because it is only for
687  // output here. Similarly, we don't need to mark exports_
688  // as modified, since it is read only here. This legacy
689  // version of doTransfer only uses host arrays.
690  imports_.template modify<Kokkos::HostSpace> ();
691  Teuchos::ArrayView<packet_type> hostImports =
692  getArrayViewFromDualView (imports_);
693  exports_.template sync<Kokkos::HostSpace> ();
694  Teuchos::ArrayView<const packet_type> hostExports =
695  getArrayViewFromDualView (exports_);
696  distor.doReversePostsAndWaits (hostExports,
697  numExportPacketsPerLID,
698  hostImports,
699  numImportPacketsPerLID);
700  }
701  else {
702  // We don't need to sync imports_, because it is only for
703  // output here. Similarly, we don't need to mark exports_
704  // as modified, since it is read only here. This legacy
705  // version of doTransfer only uses host arrays.
706  imports_.template modify<Kokkos::HostSpace> ();
707  Teuchos::ArrayView<packet_type> hostImports =
708  getArrayViewFromDualView (imports_);
709  exports_.template sync<Kokkos::HostSpace> ();
710  Teuchos::ArrayView<const packet_type> hostExports =
711  getArrayViewFromDualView (exports_);
712  distor.doReversePostsAndWaits (hostExports,
713  constantNumPackets,
714  hostImports);
715  }
716  }
717  else { // revOp == DoForward
718 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
719  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
720 #endif // HAVE_TPETRA_TRANSFER_TIMERS
721  if (constantNumPackets == 0) { //variable num-packets-per-LID:
722  // First communicate the number of packets per LID to receive.
723 
724  // Make sure that host has the latest version, since we're
725  // using the version on host. If host has the latest
726  // version already, syncing to host does nothing.
727  numExportPacketsPerLID_.template sync<Kokkos::HostSpace> ();
728  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
729  getArrayViewFromDualView (numExportPacketsPerLID_);
730 
731  // numImportPacketsPerLID_ is the output array here, so
732  // mark it as modified. It's strictly output, so we don't
733  // have to sync from device.
734  //numImportPacketsPerLID_.template sync<Kokkos::HostSpace> ();
735  numImportPacketsPerLID_.template modify<Kokkos::HostSpace> ();
736  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
737  getArrayViewFromDualView (numImportPacketsPerLID_);
738  distor.doPostsAndWaits (numExportPacketsPerLID, 1,
739  numImportPacketsPerLID);
 // The total number of incoming packets is the sum of the received
 // per-LID counts; it determines the size of imports_.
740  size_t totalImportPackets = 0;
741  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
742  totalImportPackets += numImportPacketsPerLID[i];
743  }
744 
745  reallocImportsIfNeeded (totalImportPackets, debug);
746 
747  // We don't need to sync imports_, because it is only for
748  // output here. Similarly, we don't need to mark exports_
749  // as modified, since it is read only here. This legacy
750  // version of doTransfer only uses host arrays.
751  imports_.template modify<Kokkos::HostSpace> ();
752  Teuchos::ArrayView<packet_type> hostImports =
753  getArrayViewFromDualView (imports_);
754  exports_.template sync<Kokkos::HostSpace> ();
755  Teuchos::ArrayView<const packet_type> hostExports =
756  getArrayViewFromDualView (exports_);
757  distor.doPostsAndWaits (hostExports,
758  numExportPacketsPerLID,
759  hostImports,
760  numImportPacketsPerLID);
761  }
762  else {
763  // We don't need to sync imports_, because it is only for
764  // output here. Similarly, we don't need to mark exports_
765  // as modified, since it is read only here. This legacy
766  // version of doTransfer only uses host arrays.
767  imports_.template modify<Kokkos::HostSpace> ();
768  Teuchos::ArrayView<packet_type> hostImports =
769  getArrayViewFromDualView (imports_);
770  exports_.template sync<Kokkos::HostSpace> ();
771  Teuchos::ArrayView<const packet_type> hostExports =
772  getArrayViewFromDualView (exports_);
773  distor.doPostsAndWaits (hostExports,
774  constantNumPackets,
775  hostImports);
776  }
777  }
778  {
779 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
780  Teuchos::TimeMonitor unpackAndCombineMon (*unpackAndCombineTimer_);
781 #endif // HAVE_TPETRA_TRANSFER_TIMERS
782 
783  // We don't need to sync imports_, because it is only for
784  // output here. This legacy version of doTransfer only uses
785  // host arrays.
786  imports_.template modify<Kokkos::HostSpace> ();
787  Teuchos::ArrayView<packet_type> hostImports =
788  getArrayViewFromDualView (imports_);
789  // NOTE (mfh 25 Apr 2016) unpackAndCombine doesn't actually
790  // change its numImportPacketsPerLID argument, so we don't
791  // have to mark it modified here.
792  numImportPacketsPerLID_.template sync<Kokkos::HostSpace> ();
793  // FIXME (mfh 25 Apr 2016) unpackAndCombine doesn't actually
794  // change its numImportPacketsPerLID argument, so we should
795  // be able to use a const Teuchos::ArrayView here.
796  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
797  getArrayViewFromDualView (numImportPacketsPerLID_);
798  unpackAndCombine (remoteLIDs, hostImports, numImportPacketsPerLID,
799  constantNumPackets, distor, CM);
800  }
801  }
802  } // if (CM != ZERO)
803 
 // Release the view of this (target) object's data that was created
 // by createViewsNonConst above.
804  this->releaseViews ();
805  }
806 
807  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
808  void
809  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node, classic>::
810  doTransferNew (const SrcDistObject& src,
811  const CombineMode CM,
812  const size_t numSameIDs,
813  const Kokkos::DualView<const local_ordinal_type*,
814  device_type>& permuteToLIDs,
815  const Kokkos::DualView<const local_ordinal_type*,
816  device_type>& permuteFromLIDs,
817  const Kokkos::DualView<const local_ordinal_type*,
818  device_type>& remoteLIDs,
819  const Kokkos::DualView<const local_ordinal_type*,
820  device_type>& exportLIDs,
821  Distributor& distor,
822  const ReverseOption revOp,
823  const bool commOnHost)
824  {
826  using Kokkos::Compat::getArrayView;
827  using Kokkos::Compat::getConstArrayView;
828  using Kokkos::Compat::getKokkosViewDeepCopy;
829  using Kokkos::Compat::create_const_view;
830  typedef LocalOrdinal LO;
831  typedef typename Kokkos::DualView<LO*,
832  device_type>::t_dev::memory_space dev_memory_space;
833  typedef typename Kokkos::DualView<LO*,
834  device_type>::t_host::memory_space host_memory_space;
835  const bool debug = false;
836 
837  if (debug) {
838  std::ostringstream os;
839  os << ">>> DistObject::doTransferNew: remoteLIDs.size() = "
840  << remoteLIDs.dimension_0 () << std::endl;
841  std::cerr << os.str ();
842  }
843 
844 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
845  Teuchos::TimeMonitor doXferMon (*doXferTimer_);
846 #endif // HAVE_TPETRA_TRANSFER_TIMERS
847 
848  if (debug) {
849  std::cerr << ">>> 1. checkSizes" << std::endl;
850  }
851 
852  TEUCHOS_TEST_FOR_EXCEPTION(
853  ! checkSizes (src), std::invalid_argument,
854  "Tpetra::DistObject::doTransfer(): checkSizes() indicates that the "
855  "destination object is not a legal target for redistribution from the "
856  "source object. This probably means that they do not have the same "
857  "dimensions. For example, MultiVectors must have the same number of "
858  "rows and columns.");
859 
860  // NOTE (mfh 26 Apr 2016) Chris Baker's implementation understood
861  // that if CM == INSERT || CM == REPLACE, the target object could
862  // be write only. We don't optimize for that here.
863 
864  if (debug) {
865  std::cerr << ">>> 2. copyAndPermuteNew" << std::endl;
866  }
867 
868  if (numSameIDs + permuteToLIDs.dimension_0 () != 0) {
869  // There is at least one GID to copy or permute.
870 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
871  Teuchos::TimeMonitor copyAndPermuteMon (*copyAndPermuteTimer_);
872 #endif // HAVE_TPETRA_TRANSFER_TIMERS
873  copyAndPermuteNew (src, numSameIDs, permuteToLIDs, permuteFromLIDs);
874  }
875 
876  // The method may return zero even if the implementation actually
877  // does have a constant number of packets per LID. However, if it
878  // returns nonzero, we may use this information to avoid
879  // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare()
880  // will set this to its final value.
881  //
882  // We only need this if CM != ZERO, but it has to be lifted out of
883  // that scope because there are multiple tests for CM != ZERO.
884  size_t constantNumPackets = this->constantNumberOfPackets ();
885 
886  // We only need to pack communication buffers if the combine mode
887  // is not ZERO. A "ZERO combine mode" means that the results are
888  // the same as if we had received all zeros, and added them to the
889  // existing values. That means we don't need to communicate.
890  if (CM != ZERO) {
891  if (constantNumPackets == 0) {
892  if (debug) {
893  std::cerr << ">>> 3. Allocate num{Ex,Im}portPacketsPerLID" << std::endl;
894  }
895 
896  // Don't "realloc" unless you really need to. That will avoid
897  // a bit of time for reinitializing the Views' data.
898 
899  if (numExportPacketsPerLID_.dimension_0 () != exportLIDs.dimension_0 ()) {
900  // FIXME (mfh 25 Apr 2016) Fences around (UVM) allocations
901  // facilitate #227 debugging, but shouldn't be needed.
902  execution_space::fence ();
903  numExportPacketsPerLID_ =
904  decltype (numExportPacketsPerLID_) ("numExportPacketsPerLID",
905  exportLIDs.dimension_0 ());
906  execution_space::fence ();
907  }
908  if (numImportPacketsPerLID_.dimension_0 () != remoteLIDs.dimension_0 ()) {
909  // FIXME (mfh 25 Apr 2016) Fences around (UVM) allocations
910  // facilitate #227 debugging, but shouldn't be needed.
911  execution_space::fence ();
912  numImportPacketsPerLID_ =
913  decltype (numImportPacketsPerLID_) ("numImportPacketsPerLID",
914  remoteLIDs.dimension_0 ());
915  execution_space::fence ();
916  }
917  }
918 
919  if (debug) {
920  std::cerr << ">>> 4. packAndPrepareNew" << std::endl;
921  }
922 
923  {
924 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
925  Teuchos::TimeMonitor packAndPrepareMon (*packAndPrepareTimer_);
926 #endif // HAVE_TPETRA_TRANSFER_TIMERS
927 
928  if (debug) {
929  std::ostringstream os;
930  const int myRank = this->getMap ()->getComm ()->getRank ();
931  os << ">>> (Proc " << myRank << ") 5.0. Before packAndPrepareNew, "
932  "exports_.dimension_0()=" << exports_.dimension_0 () << std::endl;
933  std::cerr << os.str ();
934  }
935  // Ask the source to pack data. Also ask it whether there are
936  // a constant number of packets per element
937  // (constantNumPackets is an output argument). If there are,
938  // constantNumPackets will come back nonzero. Otherwise, the
939  // source will fill the numExportPacketsPerLID_ array.
940  packAndPrepareNew (src, exportLIDs, exports_, numExportPacketsPerLID_,
941  constantNumPackets, distor);
942  if (debug) {
943  std::ostringstream os;
944  const int myRank = this->getMap ()->getComm ()->getRank ();
945  os << ">>> (Proc " << myRank << ") 5.0. After packAndPrepareNew, "
946  "exports_.dimension_0()=" << exports_.dimension_0 () << std::endl;
947  std::cerr << os.str ();
948  }
949  }
950  }
951 
952  // We only need to send data if the combine mode is not ZERO.
953  if (CM != ZERO) {
954  if (constantNumPackets != 0) {
955  if (debug) {
956  std::cerr << ">>> 6. Realloc imports_" << std::endl;
957  }
958  // There are a constant number of packets per element. We
959  // already know (from the number of "remote" (incoming)
960  // elements) how many incoming elements we expect, so we can
961  // resize the buffer accordingly.
962  const size_t rbufLen = remoteLIDs.dimension_0 () * constantNumPackets;
963  reallocImportsIfNeeded (rbufLen, debug);
964  }
965 
966  // Do we need to do communication (via doPostsAndWaits)?
967  bool needCommunication = true;
968 
969  // This may be NULL. It will be used below.
970  const this_type* srcDistObj = dynamic_cast<const this_type*> (&src);
971 
972  if (revOp == DoReverse && ! isDistributed ()) {
973  needCommunication = false;
974  }
975  // FIXME (mfh 30 Jun 2013): Checking whether the source object
976  // is distributed requires a cast to DistObject. If it's not a
977  // DistObject, then I'm not quite sure what to do. Perhaps it
978  // would be more appropriate for SrcDistObject to have an
979  // isDistributed() method. For now, I'll just assume that we
980  // need to do communication unless the cast succeeds and the
981  // source is not distributed.
982  else if (revOp == DoForward && srcDistObj != NULL &&
983  ! srcDistObj->isDistributed ()) {
984  needCommunication = false;
985  }
986 
987  // FIXME (mfh 17 Feb 2014) Distributor doesn't actually inspect
988  // the contents of the "exports" or "imports" arrays, other than
989  // to do a deep copy in the (should be technically unnecessary,
990  // but isn't for some odd reason) "self-message" case.
991  // Distributor thus doesn't need host views; it could do just
992  // fine with device views, assuming that MPI knows how to read
993  // device memory (which doesn't even require UVM).
994 
995  if (needCommunication) {
996  if (revOp == DoReverse) {
997  if (debug) {
998  std::cerr << ">>> 7.0. Reverse mode" << std::endl;
999  }
1000 
1001 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1002  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
1003 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1004  if (constantNumPackets == 0) { //variable num-packets-per-LID:
1005  if (debug) {
1006  std::cerr << ">>> 7.1. Variable # packets / LID: first comm"
1007  << std::endl;
1008  }
1009  numExportPacketsPerLID_.template sync<host_memory_space> ();
1010  numImportPacketsPerLID_.template sync<host_memory_space> ();
1011  distor.doReversePostsAndWaits (create_const_view (numExportPacketsPerLID_.template view<host_memory_space> ()),
1012  1,
1013  numImportPacketsPerLID_.template view<host_memory_space> ());
1014  size_t totalImportPackets = 0;
1015  // FIXME (mfh 17 Feb 2014) This would be a good place for
1016  // a Kokkos reduction. numImportPacketsPerLID_ has as
1017  // many entries as the number of LIDs on the calling
1018  // process.
1019  {
1020  typedef decltype (numImportPacketsPerLID_) dual_view_type;
1021  typedef typename dual_view_type::t_host host_view_type;
1022  typedef typename host_view_type::const_type const_host_view_type;
1023 
1024  const_host_view_type host_numImportPacketsPerLID =
1025  numImportPacketsPerLID_.template view<host_memory_space> ();
1026  const view_size_type numLids = host_numImportPacketsPerLID.size ();
1027  for (view_size_type i = 0; i < numLids; ++i) {
1028  totalImportPackets += host_numImportPacketsPerLID[i];
1029  }
1030  }
1031 
1032  if (debug) {
1033  std::cerr << ">>> 7.2. Realloc" << std::endl;
1034  }
1035 
1036  reallocImportsIfNeeded (totalImportPackets, debug);
1037 
1038  if (debug) {
1039  std::cerr << ">>> 7.3. Second comm" << std::endl;
1040  }
1041 
1042  if (commOnHost) {
1043  numExportPacketsPerLID_.template sync<host_memory_space> ();
1044  numImportPacketsPerLID_.template sync<host_memory_space> ();
1045  // imports_ is for output only, so we don't need to sync it.
1046  imports_.template modify<host_memory_space> ();
1047  distor.doReversePostsAndWaits (create_const_view (exports_.template view<host_memory_space> ()),
1048  getArrayViewFromDualView (numExportPacketsPerLID_),
1049  imports_.template view<host_memory_space> (),
1050  getArrayViewFromDualView (numImportPacketsPerLID_));
1051  }
1052  else {
1053  // FIXME (mfh 25 Apr 2016) Once doReversePostsAndWaits
1054  // can take numExportPacketsPerLID and
1055  // numImportPacketsPerLID as View or DualView, rather
1056  // than as Teuchos::ArrayView, then we can use their
1057  // device versions. For now, we'll use their host
1058  // versions.
1059  numExportPacketsPerLID_.template sync<host_memory_space> ();
1060  numImportPacketsPerLID_.template sync<host_memory_space> ();
1061  // imports_ is for output only, so we don't need to sync it.
1062  imports_.template modify<dev_memory_space> ();
1063  distor.doReversePostsAndWaits (create_const_view (exports_.template view<dev_memory_space> ()),
1064  getArrayViewFromDualView (numExportPacketsPerLID_),
1065  imports_.template view<dev_memory_space> (),
1066  getArrayViewFromDualView (numImportPacketsPerLID_));
1067  }
1068  }
1069  else {
1070  if (debug) {
1071  const int myRank = this->getMap ()->getComm ()->getRank ();
1072  std::ostringstream os;
1073  os << ">>> (Proc " << myRank << "): 7.1. Const # packets per "
1074  "LID: exports_.dimension_0() = " << exports_.dimension_0 ()
1075  << ", imports_.dimension_0() = " << imports_.dimension_0 ()
1076  << std::endl;
1077  std::cerr << os.str ();
1078  }
1079  if (commOnHost) {
1080  // imports_ is for output only, so we don't need to sync it.
1081  imports_.template modify<host_memory_space> ();
1082  distor.doReversePostsAndWaits (create_const_view (exports_.template view<host_memory_space> ()),
1083  constantNumPackets,
1084  imports_.template view<host_memory_space> ());
1085  }
1086  else { // pack on device
1087  // imports_ is for output only, so we don't need to sync it.
1088  imports_.template modify<dev_memory_space> ();
1089  distor.doReversePostsAndWaits (create_const_view (exports_.template view<dev_memory_space> ()),
1090  constantNumPackets,
1091  imports_.template view<dev_memory_space> ());
1092  }
1093  }
1094  }
1095  else { // revOp == DoForward
1096  if (debug) {
1097  std::cerr << ">>> 7.0. Forward mode" << std::endl;
1098  }
1099 
1100 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1101  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
1102 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1103  if (constantNumPackets == 0) { //variable num-packets-per-LID:
1104  if (debug) {
1105  std::cerr << ">>> 7.1. Variable # packets / LID: first comm" << std::endl;
1106  }
1107 
1108  numExportPacketsPerLID_.template sync<host_memory_space> ();
1109  numImportPacketsPerLID_.template sync<host_memory_space> ();
1110  distor.doPostsAndWaits (create_const_view (numExportPacketsPerLID_.template view<host_memory_space> ()), 1,
1111  numImportPacketsPerLID_.template view<host_memory_space> ());
1112  size_t totalImportPackets = 0;
1113  {
1114  typedef decltype (numImportPacketsPerLID_) dual_view_type;
1115  typedef typename dual_view_type::t_host host_view_type;
1116  typedef typename host_view_type::const_type const_host_view_type;
1117  const_host_view_type host_numImportPacketsPerLID =
1118  numImportPacketsPerLID_.template view<host_memory_space> ();
1119 
1120  // FIXME (mfh 17 Feb 2014) This would be a good place for
1121  // a Kokkos reduction. numImportPacketsPerLID_ has as
1122  // many entries as the number of LIDs on the calling
1123  // process.
1124  const view_size_type numLids = host_numImportPacketsPerLID.size ();
1125  for (view_size_type i = 0; i < numLids; ++i) {
1126  totalImportPackets += host_numImportPacketsPerLID[i];
1127  }
1128  }
1129 
1130  if (debug) {
1131  std::cerr << ">>> 7.2. Realloc" << std::endl;
1132  }
1133 
1134  reallocImportsIfNeeded (totalImportPackets, debug);
1135 
1136  if (debug) {
1137  std::cerr << ">>> 7.3. Second comm" << std::endl;
1138  }
1139 
1140  if (commOnHost) {
1141  numExportPacketsPerLID_.template sync<host_memory_space> ();
1142  numImportPacketsPerLID_.template sync<host_memory_space> ();
1143  // imports_ is for output only, so we don't need to sync it.
1144  imports_.template modify<host_memory_space> ();
1145  distor.doPostsAndWaits (create_const_view (exports_.template view<host_memory_space> ()),
1146  getArrayViewFromDualView (numExportPacketsPerLID_),
1147  imports_.template view<host_memory_space> (),
1148  getArrayViewFromDualView (numImportPacketsPerLID_));
1149  }
1150  else { // pack on device
1151  // FIXME (mfh 25 Apr 2016) Once doReversePostsAndWaits
1152  // can take numExportPacketsPerLID and
1153  // numImportPacketsPerLID as a View or DualView, rather
1154  // than as a Teuchos::ArrayView, then we can use their
1155  // device version. For now, we'll use the host version.
1156  numExportPacketsPerLID_.template sync<host_memory_space> ();
1157  numImportPacketsPerLID_.template sync<host_memory_space> ();
1158  // imports_ is for output only, so we don't need to sync it.
1159  imports_.template modify<dev_memory_space> ();
1160  distor.doPostsAndWaits (create_const_view (exports_.template view<dev_memory_space> ()),
1161  getArrayViewFromDualView (numExportPacketsPerLID_),
1162  imports_.template view<dev_memory_space> (),
1163  getArrayViewFromDualView (numImportPacketsPerLID_));
1164  }
1165  }
1166  else {
1167  if (debug) {
1168  const int myRank = this->getMap ()->getComm ()->getRank ();
1169  std::ostringstream os;
1170  os << ">>> (Proc " << myRank << "): 7.1. Const # packets per "
1171  "LID: exports_.dimension_0()=" << exports_.dimension_0 ()
1172  << ", imports_.dimension_0() = " << imports_.dimension_0 ()
1173  << std::endl;
1174  std::cerr << os.str ();
1175  }
1176 
1177  if (commOnHost) {
1178  // imports_ is for output only, so we don't need to sync it.
1179  imports_.template modify<host_memory_space> ();
1180  distor.doPostsAndWaits (create_const_view (exports_.template view<host_memory_space> ()),
1181  constantNumPackets,
1182  imports_.template view<host_memory_space> ());
1183  }
1184  else { // pack on device
1185  // imports_ is for output only, so we don't need to sync it.
1186  imports_.template modify<dev_memory_space> ();
1187  distor.doPostsAndWaits (create_const_view (exports_.template view<dev_memory_space> ()),
1188  constantNumPackets,
1189  imports_.template view<dev_memory_space> ());
1190  }
1191  }
1192  }
1193 
1194  if (debug) {
1195  std::cerr << ">>> 8. unpackAndCombineNew" << std::endl;
1196  }
1197 
1198  {
1199 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1200  Teuchos::TimeMonitor unpackAndCombineMon (*unpackAndCombineTimer_);
1201 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1202 
1203  // NOTE (mfh 26 Apr 2016) We don't actually need to sync the
1204  // input DualViews, but they DO need to be most recently
1205  // updated in the same memory space.
1206  //
1207  // FIXME (mfh 26 Apr 2016) Check that all input DualViews
1208  // were most recently updated in the same memory space, and
1209  // sync them to the same place (based on commOnHost) if not.
1210  unpackAndCombineNew (remoteLIDs, imports_, numImportPacketsPerLID_,
1211  constantNumPackets, distor, CM);
1212  }
1213  }
1214  } // if (CM != ZERO)
1215 
1216  if (debug) {
1217  std::cerr << ">>> 9. Done with doTransferNew" << std::endl;
1218  }
1219  }
1220 
1221  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1222  void
1223  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node, classic>::
1224  print (std::ostream &os) const
1225  {
1226  using Teuchos::FancyOStream;
1227  using Teuchos::getFancyOStream;
1228  using Teuchos::RCP;
1229  using Teuchos::rcpFromRef;
1230  using std::endl;
1231 
1232  RCP<FancyOStream> out = getFancyOStream (rcpFromRef (os));
1233  this->describe (*out, Teuchos::VERB_DEFAULT);
1234  }
1235 
// Member function definition with an empty body (a no-op).
// NOTE(review): the embedded listing numbers jump from 1237 to 1240, so
// the lines carrying the class qualifier and the function name are absent
// from this listing.  Presumably this is
// DistObject<...>::createViews () const, by analogy with
// createViewsNonConst in this file -- confirm against the upstream file
// and restore the missing lines before compiling.
1236  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1237  void
1240  {}
1241 
// createViewsNonConst: empty body (a no-op).  The ReadWriteOption argument
// is ignored; its name is commented out in the parameter list.
// NOTE(review): listing line 1244 is absent here.  Other member
// definitions in this file carry a
// "DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node, classic>::"
// qualifier line between "void" and the function name, so presumably that
// line was dropped -- confirm against the upstream file.
1242  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1243  void
1245  createViewsNonConst (KokkosClassic::ReadWriteOption /*rwo*/)
1246  {}
1247 
// Member function definition with an empty body (a no-op).
// NOTE(review): the embedded listing numbers jump from 1249 to 1252, so
// the lines carrying the class qualifier and the function name are absent
// from this listing.  Presumably this is
// DistObject<...>::releaseViews () const, since this->releaseViews () is
// called elsewhere in this file -- confirm against the upstream file and
// restore the missing lines before compiling.
1248  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1249  void
1252  {}
1253 
1254  template<class DistObjectType>
1255  void
1256  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input,
1257  const Teuchos::RCP<const Map<typename DistObjectType::local_ordinal_type,
1258  typename DistObjectType::global_ordinal_type,
1259  typename DistObjectType::node_type> >& newMap)
1260  {
1261  input->removeEmptyProcessesInPlace (newMap);
1262  if (newMap.is_null ()) { // my process is excluded
1263  input = Teuchos::null;
1264  }
1265  }
1266 
1267  template<class DistObjectType>
1268  void
1269  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input)
1270  {
1271  using Teuchos::RCP;
1272  typedef typename DistObjectType::local_ordinal_type LO;
1273  typedef typename DistObjectType::global_ordinal_type GO;
1274  typedef typename DistObjectType::node_type NT;
1275  typedef Map<LO, GO, NT> map_type;
1276 
1277  RCP<const map_type> newMap = input->getMap ()->removeEmptyProcesses ();
1278  removeEmptyProcessesInPlace<DistObjectType> (input, newMap);
1279  }
1280 
1281 // Explicit instantiation macro for general DistObject.
// Expands to an explicit instantiation definition of
// DistObject<SCALAR, LO, GO, NODE>.  Only four template arguments are
// passed; the fifth ("classic") presumably has a default -- confirm
// against the class declaration.
1282 #define TPETRA_DISTOBJECT_INSTANT(SCALAR, LO, GO, NODE) \
1283  template class DistObject< SCALAR , LO , GO , NODE >;
1284 
1285 // Explicit instantiation macro for DistObject<char, ...>.
1286 // The "SLGN" stuff above doesn't work for Packet=char.
// Same as TPETRA_DISTOBJECT_INSTANT, but with the first template argument
// fixed to char.
1287 #define TPETRA_DISTOBJECT_INSTANT_CHAR(LO, GO, NODE) \
1288  template class DistObject< char , LO , GO , NODE >;
1289 
1290 } // namespace Tpetra
1291 
1292 #endif // TPETRA_DISTOBJECT_DEF_HPP
void doPostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the (forward) communication plan.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distribution.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Teuchos::ArrayView< const LocalOrdinal > getExportLIDs() const
List of entries in the source Map that will be sent to other processes.
Teuchos::ArrayView< const LocalOrdinal > getExportLIDs() const
List of entries in the source Map that will be sent to other processes.
Distributor & getDistributor() const
The Distributor that this Export object uses to move data.
Teuchos::RCP< const map_type > getSourceMap() const
The Source Map used to construct this Import object.
size_t getNumSameIDs() const
Number of initial identical IDs.
Teuchos::ArrayView< const LocalOrdinal > getPermuteFromLIDs() const
List of local IDs in the source Map that are permuted.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< const map_type > getSourceMap() const
The source Map used to construct this Export.
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object's Map.
Teuchos::ArrayView< const LocalOrdinal > getPermuteToLIDs() const
List of local IDs in the target Map that are permuted.
void doReversePostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the reverse communication plan.
Teuchos::ArrayView< const LocalOrdinal > getPermuteFromLIDs() const
List of local IDs in the source Map that are permuted.
Insert new values that don't currently exist.
Teuchos::ArrayView< const LocalOrdinal > getPermuteToLIDs() const
List of local IDs in the target Map that are permuted.
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host memory).
Teuchos::ArrayView< const LocalOrdinal > getRemoteLIDs() const
List of entries in the target Map to receive from other processes.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distribution.
Teuchos::RCP< const map_type > getTargetMap() const
The Target Map used to construct this Import object.
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Abstract base class for objects that can be the source of an Import or Export operation.
Replace existing values with new values.
Replace old values with zero.
ReverseOption
Whether the data transfer should be performed in forward or reverse mode.
DistObject(const Teuchos::RCP< const map_type > &map)
Constructor.
Describes a parallel distribution of objects over processes.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > removeEmptyProcesses() const
Return a new Map with processes with zero elements removed.
size_t getNumSameIDs() const
Number of initial identical IDs.
Distributor & getDistributor() const
The Distributor that this Import object uses to move data.
Teuchos::ArrayView< const LocalOrdinal > getRemoteLIDs() const
List of entries in the target Map to receive from other processes.
bool isDistributed() const
Whether this is a globally distributed object.
Base class for distributed Tpetra objects that support data redistribution.
Teuchos::RCP< const map_type > getTargetMap() const
The target Map used to construct this Export.