Tpetra_Distributor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DISTRIBUTOR_HPP
43 #define TPETRA_DISTRIBUTOR_HPP
44 
45 #include "Tpetra_Util.hpp"
46 #include <Teuchos_as.hpp>
47 #include <Teuchos_Describable.hpp>
48 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
49 #include <Teuchos_VerboseObject.hpp>
50 
51 // If TPETRA_DISTRIBUTOR_TIMERS is defined, Distributor will time
52 // doPosts (both versions) and doWaits, and register those timers with
53 // Teuchos::TimeMonitor so that summarize() or report() will show
54 // results.
55 
56 // #ifndef TPETRA_DISTRIBUTOR_TIMERS
57 // # define TPETRA_DISTRIBUTOR_TIMERS 1
58 // #endif // TPETRA_DISTRIBUTOR_TIMERS
59 
60 #ifdef TPETRA_DISTRIBUTOR_TIMERS
61 # undef TPETRA_DISTRIBUTOR_TIMERS
62 #endif // TPETRA_DISTRIBUTOR_TIMERS
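// For example: to turn the timers on, uncomment the #ifndef/#define block
// above and remove the #undef block immediately above this comment; the
// timing results then appear in Teuchos::TimeMonitor::summarize() or
// report() output, as described above.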
63 
64 #include "KokkosCompat_View.hpp"
65 #include "Kokkos_Core.hpp"
66 #include "Kokkos_TeuchosCommAdapters.hpp"
67 
68 
69 namespace Tpetra {
70 
71  namespace Details {
 76  enum EDistributorSendType {
 77  DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
78  DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
79  DISTRIBUTOR_SEND, // Use MPI_Send (Teuchos::send)
80  DISTRIBUTOR_SSEND // Use MPI_Ssend (Teuchos::ssend)
81  };
82 
 87  std::string
 88  DistributorSendTypeEnumToString (EDistributorSendType sendType);
89 
 94  enum EDistributorHowInitialized {
 95  DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
96  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
97  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
98  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS, // By createFromSendsAndRecvs
99  DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
100  DISTRIBUTOR_INITIALIZED_BY_COPY, // By copy constructor
101  };
102 
 107  std::string
 108  DistributorHowInitializedEnumToString (EDistributorHowInitialized how);
109 
110  } // namespace Details
111 
118  Teuchos::Array<std::string> distributorSendTypes ();
119 
187  class Distributor :
188  public Teuchos::Describable,
189  public Teuchos::ParameterListAcceptorDefaultBase {
190  public:
192 
193 
202  explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);
203 
215  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
216  const Teuchos::RCP<Teuchos::FancyOStream>& out);
217 
231  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
232  const Teuchos::RCP<Teuchos::ParameterList>& plist);
233 
250  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
251  const Teuchos::RCP<Teuchos::FancyOStream>& out,
252  const Teuchos::RCP<Teuchos::ParameterList>& plist);
253 
255  Distributor (const Distributor& distributor);
256 
258  virtual ~Distributor ();
259 
265  void swap (Distributor& rhs);
266 
268 
270 
275  void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);
276 
281  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;
282 
284 
286 
306  size_t createFromSends (const Teuchos::ArrayView<const int>& exportProcIDs);
307 
341  template <class Ordinal>
342  void
343  createFromRecvs (const Teuchos::ArrayView<const Ordinal>& remoteIDs,
344  const Teuchos::ArrayView<const int>& remoteProcIDs,
345  Teuchos::Array<Ordinal>& exportIDs,
346  Teuchos::Array<int>& exportProcIDs);
347 
355  void
356  createFromSendsAndRecvs (const Teuchos::ArrayView<const int>& exportProcIDs,
357  const Teuchos::ArrayView<const int>& remoteProcIDs);
358 
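 // A minimal usage sketch of the createFromSends() / doPostsAndWaits()
 // pattern (illustrative only; "comm", "exportProcIDs", and "exports" are
 // hypothetical caller-owned data, with one double packet per export ID):
 //
 //   Tpetra::Distributor distributor (comm);
 //   const size_t numImports =
 //     distributor.createFromSends (exportProcIDs ());
 //   Teuchos::Array<double> imports (numImports);
 //   distributor.doPostsAndWaits<double> (exports ().getConst (), 1,
 //                                        imports ());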
360 
362 
366  size_t getNumReceives() const;
367 
371  size_t getNumSends() const;
372 
374  bool hasSelfMessage() const;
375 
377  size_t getMaxSendLength() const;
378 
380  size_t getTotalReceiveLength() const;
381 
386  Teuchos::ArrayView<const int> getProcsFrom() const;
387 
392  Teuchos::ArrayView<const int> getProcsTo() const;
393 
401  Teuchos::ArrayView<const size_t> getLengthsFrom() const;
402 
410  Teuchos::ArrayView<const size_t> getLengthsTo() const;
411 
 416  Details::EDistributorHowInitialized howInitialized () const {
 417  return howInitialized_;
418  }
419 
421 
423 
434  Teuchos::RCP<Distributor> getReverse() const;
435 
437 
439 
460  template <class Packet>
461  void
462  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
463  size_t numPackets,
464  const Teuchos::ArrayView<Packet> &imports);
465 
487  template <class Packet>
488  void
489  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
490  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
491  const Teuchos::ArrayView<Packet> &imports,
492  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
493 
518  template <class Packet>
519  void
520  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
521  size_t numPackets,
522  const Teuchos::ArrayRCP<Packet> &imports);
523 
542  template <class Packet>
543  void
544  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
545  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
546  const Teuchos::ArrayRCP<Packet> &imports,
547  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
548 
555  void doWaits ();
556 
561  template <class Packet>
562  void
563  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
564  size_t numPackets,
565  const Teuchos::ArrayView<Packet> &imports);
566 
571  template <class Packet>
572  void
573  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
574  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
575  const Teuchos::ArrayView<Packet> &imports,
576  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
577 
582  template <class Packet>
583  void
584  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
585  size_t numPackets,
586  const Teuchos::ArrayRCP<Packet> &imports);
587 
592  template <class Packet>
593  void
594  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
595  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
596  const Teuchos::ArrayRCP<Packet> &imports,
597  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
598 
605  void doReverseWaits ();
606 
627  template <class ExpView, class ImpView>
628  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
 629  doPostsAndWaits (
 630  const ExpView &exports,
631  size_t numPackets,
632  const ImpView &imports);
633 
655  template <class ExpView, class ImpView>
656  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
657  doPostsAndWaits (const ExpView &exports,
658  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
659  const ImpView &imports,
660  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
661 
686  template <class ExpView, class ImpView>
687  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
688  doPosts (const ExpView &exports,
689  size_t numPackets,
690  const ImpView &imports);
691 
710  template <class ExpView, class ImpView>
711  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
712  doPosts (const ExpView &exports,
713  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
714  const ImpView &imports,
715  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
716 
721  template <class ExpView, class ImpView>
722  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
723  doReversePostsAndWaits (const ExpView &exports,
724  size_t numPackets,
725  const ImpView &imports);
726 
731  template <class ExpView, class ImpView>
732  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
733  doReversePostsAndWaits (const ExpView &exports,
734  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
735  const ImpView &imports,
736  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
737 
742  template <class ExpView, class ImpView>
743  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
744  doReversePosts (const ExpView &exports,
745  size_t numPackets,
746  const ImpView &imports);
747 
752  template <class ExpView, class ImpView>
753  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
754  doReversePosts (const ExpView &exports,
755  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
756  const ImpView &imports,
757  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
758 
 762  void getLastDoStatistics(size_t & bytes_sent, size_t & bytes_recvd) const {
763  bytes_sent = lastRoundBytesSend_;
764  bytes_recvd = lastRoundBytesRecv_;
765  }
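 // For example (illustrative only):
 //
 //   size_t bytesSent = 0, bytesRecvd = 0;
 //   distributor.getLastDoStatistics (bytesSent, bytesRecvd);
 //   // bytesSent and bytesRecvd now hold the byte counts from the most
 //   // recent doPosts / doPostsAndWaits round on this process.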
766 
768 
770 
772  std::string description() const;
773 
795  void
796  describe (Teuchos::FancyOStream& out,
797  const Teuchos::EVerbosityLevel verbLevel =
798  Teuchos::Describable::verbLevel_default) const;
800 
801  private:
803  Teuchos::RCP<const Teuchos::Comm<int> > comm_;
804 
806  Teuchos::RCP<Teuchos::FancyOStream> out_;
807 
809  Details::EDistributorHowInitialized howInitialized_;
810 
812 
 813 
 815  Details::EDistributorSendType sendType_;
 816 
818  bool barrierBetween_;
819 
821  bool debug_;
823 
827  bool selfMessage_;
828 
838  size_t numSends_;
839 
844  Teuchos::Array<int> procsTo_;
845 
854  Teuchos::Array<size_t> startsTo_;
855 
861  Teuchos::Array<size_t> lengthsTo_;
862 
866  size_t maxSendLength_;
867 
883  Teuchos::Array<size_t> indicesTo_;
884 
894  size_t numReceives_;
895 
902  size_t totalReceiveLength_;
903 
909  Teuchos::Array<size_t> lengthsFrom_;
910 
916  Teuchos::Array<int> procsFrom_;
917 
923  Teuchos::Array<size_t> startsFrom_;
924 
931  Teuchos::Array<size_t> indicesFrom_;
932 
939  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int> > > requests_;
940 
945  mutable Teuchos::RCP<Distributor> reverseDistributor_;
946 
948  size_t lastRoundBytesSend_;
949 
951  size_t lastRoundBytesRecv_;
952 
953 #ifdef TPETRA_DISTRIBUTOR_TIMERS
954  Teuchos::RCP<Teuchos::Time> timer_doPosts3_;
955  Teuchos::RCP<Teuchos::Time> timer_doPosts4_;
956  Teuchos::RCP<Teuchos::Time> timer_doWaits_;
957  Teuchos::RCP<Teuchos::Time> timer_doPosts3_recvs_;
958  Teuchos::RCP<Teuchos::Time> timer_doPosts4_recvs_;
959  Teuchos::RCP<Teuchos::Time> timer_doPosts3_barrier_;
960  Teuchos::RCP<Teuchos::Time> timer_doPosts4_barrier_;
961  Teuchos::RCP<Teuchos::Time> timer_doPosts3_sends_;
962  Teuchos::RCP<Teuchos::Time> timer_doPosts4_sends_;
963 
965  void makeTimers ();
966 #endif // TPETRA_DISTRIBUTOR_TIMERS
967 
979  bool useDistinctTags_;
980 
985  int getTag (const int pathTag) const;
986 
1004  void
1005  init (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1006  const Teuchos::RCP<Teuchos::FancyOStream>& out,
1007  const Teuchos::RCP<Teuchos::ParameterList>& plist);
1008 
1019  void computeReceives ();
1020 
1033  template <class Ordinal>
1034  void computeSends (const Teuchos::ArrayView<const Ordinal> &remoteGIDs,
1035  const Teuchos::ArrayView<const int> &remoteProcIDs,
1036  Teuchos::Array<Ordinal> &exportGIDs,
1037  Teuchos::Array<int> &exportProcIDs);
1038 
1040  void createReverseDistributor() const;
1041 
1042 
1047  std::string
1048  localDescribeToString (const Teuchos::EVerbosityLevel vl) const;
1049  }; // class Distributor
1050 
1051 
1052  template <class Packet>
1053  void Distributor::
1054  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1055  size_t numPackets,
1056  const Teuchos::ArrayView<Packet>& imports)
1057  {
1058  using Teuchos::arcp;
1059  using Teuchos::ArrayRCP;
1060  typedef typename ArrayRCP<const Packet>::size_type size_type;
1061 
1062  TEUCHOS_TEST_FOR_EXCEPTION(
1063  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1064  "doPostsAndWaits(3 args): There are " << requests_.size () <<
1065  " outstanding nonblocking messages pending. It is incorrect to call "
1066  "this method with posts outstanding.");
1067 
1068  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
 1069  // requiring that the memory they point to persists (as is
1070  // necessary for nonblocking receives). However, it need only
1071  // persist until doWaits() completes, so it is safe for us to use
1072  // a nonpersisting reference in this case. The use of a
1073  // nonpersisting reference is purely a performance optimization.
1074 
1075  //const Packet* exportsPtr = exports.getRawPtr();
1076  //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0),
1077  // exports.size(), false);
1078  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1079  static_cast<size_type> (0),
1080  exports.size(), false);
1081 
1082  // For some reason, neither of the options below (that use arcp)
 1083  // compiles for Packet=std::complex<double> with GCC 4.5.1. The
1084  // issue only arises with the exports array. This is why we
1085  // construct a separate nonowning ArrayRCP.
1086 
1087  // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1088  // numPackets,
1089  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1090  // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
1091  // numPackets,
1092  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1093  doPosts (exportsArcp,
1094  numPackets,
1095  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1096  doWaits ();
1097 
1098  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1099  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1100  }
1101 
1102  template <class Packet>
1103  void Distributor::
1104  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1105  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1106  const Teuchos::ArrayView<Packet> &imports,
1107  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1108  {
1109  using Teuchos::arcp;
1110  using Teuchos::ArrayRCP;
1111 
1112  TEUCHOS_TEST_FOR_EXCEPTION(
1113  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1114  "doPostsAndWaits: There are " << requests_.size () << " outstanding "
1115  "nonblocking messages pending. It is incorrect to call doPostsAndWaits "
1116  "with posts outstanding.");
1117 
1118  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
 1119  // requiring that the memory they point to persists (as is
1120  // necessary for nonblocking receives). However, it need only
1121  // persist until doWaits() completes, so it is safe for us to use
1122  // a nonpersisting reference in this case.
1123 
1124  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1125  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1126  // with some versions of GCC. The issue only arises with the
1127  // exports array. This is why we construct a separate nonowning
1128  // ArrayRCP.
1129  typedef typename ArrayRCP<const Packet>::size_type size_type;
1130  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1131  static_cast<size_type> (0),
1132  exports.size (), false);
1133  // mfh 04 Apr 2012: This is the offending code. This statement
1134  // would normally be in place of "exportsArcp" in the
1135  // doPosts() call below.
1136  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1137  doPosts (exportsArcp,
1138  numExportPacketsPerLID,
1139  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1140  numImportPacketsPerLID);
1141  doWaits ();
1142 
1143  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1144  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1145  }
1146 
1147 
1148  template <class Packet>
1149  void Distributor::
1150  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1151  size_t numPackets,
1152  const Teuchos::ArrayRCP<Packet>& imports)
1153  {
1154  using Teuchos::Array;
1155  using Teuchos::ArrayRCP;
1156  using Teuchos::ArrayView;
1157  using Teuchos::as;
1158  using Teuchos::FancyOStream;
1159  using Teuchos::includesVerbLevel;
1160  using Teuchos::ireceive;
1161  using Teuchos::isend;
1162  using Teuchos::OSTab;
1163  using Teuchos::readySend;
1164  using Teuchos::send;
1165  using Teuchos::ssend;
1166  using Teuchos::TypeNameTraits;
1167  using Teuchos::typeName;
1168  using std::endl;
1169  typedef Array<size_t>::size_type size_type;
1170 
1171 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1172  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
1173 #endif // TPETRA_DISTRIBUTOR_TIMERS
1174 
1175  const int myRank = comm_->getRank ();
1176  // Run-time configurable parameters that come from the input
1177  // ParameterList set by setParameterList().
1178  const Details::EDistributorSendType sendType = sendType_;
1179  const bool doBarrier = barrierBetween_;
1180 
1181  Teuchos::OSTab tab0 (out_);
1182  if (debug_) {
1183  std::ostringstream os;
1184  os << "Proc " << myRank << ": Distributor::doPosts(3 args, Teuchos::ArrayRCP)" << endl;
1185  *out_ << os.str ();
1186  }
1187  Teuchos::OSTab tab1 (out_);
1188 
1189  TEUCHOS_TEST_FOR_EXCEPTION(
1190  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1191  "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
1192  "barrier between posting receives and posting ready sends. This should "
1193  "have been checked before. "
1194  "Please report this bug to the Tpetra developers.");
1195 
1196  size_t selfReceiveOffset = 0;
1197 
1198  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
1199  // check whether we're doing reverse mode before checking the
1200  // length of the imports array.
1201  if (howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE) {
1202  // Each message has the same number of packets.
1203  //
 1204  // FIXME (mfh 18 Jul 2014): Relaxing this test from requiring exact
 1205  // equality to only checking less-than seems to have fixed Bug 6170. It's
1206  // OK for the 'imports' array to be longer than it needs to be;
1207  // I'm just curious why it would be.
1208  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
1209  TEUCHOS_TEST_FOR_EXCEPTION
1210  (static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1211  std::invalid_argument, "Tpetra::Distributor::doPosts(3 args): The "
1212  "'imports' array must have enough entries to hold the expected number "
1213  "of import packets. imports.size() = " << imports.size () << " < "
1214  "totalNumImportPackets = " << totalNumImportPackets << ".");
1215  }
1216 
1217  // MPI tag for nonblocking receives and blocking sends in this
1218  // method. Some processes might take the "fast" path
1219  // (indicesTo_.empty()) and others might take the "slow" path for
1220  // the same doPosts() call, so the path tag must be the same for
1221  // both.
1222  const int pathTag = 0;
1223  const int tag = this->getTag (pathTag);
1224 
1225 #ifdef HAVE_TPETRA_DEBUG
1226  TEUCHOS_TEST_FOR_EXCEPTION
1227  (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
1228  "doPosts(3 args, Kokkos): Process " << myRank << ": requests_.size() = "
1229  << requests_.size () << " != 0.");
1230 #endif // HAVE_TPETRA_DEBUG
1231 
1232  // Distributor uses requests_.size() as the number of outstanding
1233  // nonblocking message requests, so we resize to zero to maintain
1234  // this invariant.
1235  //
1236  // numReceives_ does _not_ include the self message, if there is
1237  // one. Here, we do actually send a message to ourselves, so we
1238  // include any self message in the "actual" number of receives to
1239  // post.
1240  //
1241  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1242  // doesn't (re)allocate its array of requests. That happens in
1243  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1244  // demand), or Resize_().
1245  const size_type actualNumReceives = as<size_type> (numReceives_) +
1246  as<size_type> (selfMessage_ ? 1 : 0);
1247  requests_.resize (0);
1248 
1249  if (debug_) {
1250  std::ostringstream os;
1251  os << "Proc " << myRank << ": doPosts(3,"
1252  << (indicesTo_.empty () ? "fast" : "slow") << "): Post receives"
1253  << endl;
1254  *out_ << os.str ();
1255  }
1256 
1257  // Post the nonblocking receives. It's common MPI wisdom to post
1258  // receives before sends. In MPI terms, this means favoring
1259  // adding to the "posted queue" (of receive requests) over adding
1260  // to the "unexpected queue" (of arrived messages not yet matched
1261  // with a receive).
1262  {
1263 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1264  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
1265 #endif // TPETRA_DISTRIBUTOR_TIMERS
1266 
1267  size_t curBufOffset = 0;
1268  for (size_type i = 0; i < actualNumReceives; ++i) {
1269  const size_t curBufLen = lengthsFrom_[i] * numPackets;
1270  if (procsFrom_[i] != myRank) {
1271  if (debug_) {
1272  std::ostringstream os;
1273  os << "Proc " << myRank << ": doPosts(3,"
1274  << (indicesTo_.empty () ? "fast" : "slow") << "): "
1275  << "Post irecv: {source: " << procsFrom_[i]
1276  << ", tag: " << tag << "}" << endl;
1277  *out_ << os.str ();
1278  }
1279  // If my process is receiving these packet(s) from another
1280  // process (not a self-receive):
1281  //
1282  // 1. Set up the persisting view (recvBuf) of the imports
1283  // array, given the offset and size (total number of
1284  // packets from process procsFrom_[i]).
1285  // 2. Start the Irecv and save the resulting request.
1286  TEUCHOS_TEST_FOR_EXCEPTION(
1287  curBufOffset + curBufLen > static_cast<size_t> (imports.size ()),
1288  std::logic_error, "Tpetra::Distributor::doPosts(3 args, Teuchos): "
 1289  "Exceeded size of 'imports' array while posting receives on Process " <<
1290  myRank << ". imports.size() = " << imports.size () << " < "
1291  "curBufOffset(" << curBufOffset << ") + curBufLen(" << curBufLen
1292  << ").");
1293  ArrayRCP<Packet> recvBuf =
1294  imports.persistingView (curBufOffset, curBufLen);
1295  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1296  tag, *comm_));
1297  }
1298  else { // Receiving from myself
1299  selfReceiveOffset = curBufOffset; // Remember the self-recv offset
1300  }
1301  curBufOffset += curBufLen;
1302  }
1303  }
1304 
1305  if (doBarrier) {
1306 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1307  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
1308 #endif // TPETRA_DISTRIBUTOR_TIMERS
1309 
1310  if (debug_) {
1311  std::ostringstream os;
1312  os << "Proc " << myRank << ": doPosts(3,"
1313  << (indicesTo_.empty () ? "fast" : "slow") << "): Barrier" << endl;
1314  *out_ << os.str ();
1315  }
1316  // If we are using ready sends (MPI_Rsend) below, we need to do
1317  // a barrier before we post the ready sends. This is because a
1318  // ready send requires that its matching receive has already
1319  // been posted before the send has been posted. The only way to
1320  // guarantee that in this case is to use a barrier.
1321  comm_->barrier ();
1322  }
1323 
1324 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1325  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
1326 #endif // TPETRA_DISTRIBUTOR_TIMERS
1327 
 1328  // Set up the scan through the procsTo_ list, starting with higher-numbered
 1329  // processes (this should help balance message traffic).
1330  //
1331  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
1332  // It doesn't depend on the input at all.
1333  size_t numBlocks = numSends_ + selfMessage_;
1334  size_t procIndex = 0;
1335  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
1336  ++procIndex;
1337  }
1338  if (procIndex == numBlocks) {
1339  procIndex = 0;
1340  }
1341 
1342  size_t selfNum = 0;
1343  size_t selfIndex = 0;
1344 
1345  if (debug_) {
1346  std::ostringstream os;
1347  os << "Proc " << myRank << ": doPosts(3,"
1348  << (indicesTo_.empty () ? "fast" : "slow") << "): Post sends" << endl;
1349  *out_ << os.str ();
1350  }
1351 
1352  if (indicesTo_.empty()) {
1353  if (debug_) {
1354  std::ostringstream os;
1355  os << myRank << ": doPosts(3,fast): posting sends" << endl;
1356  *out_ << os.str ();
1357  }
1358 
1359  // Data are already blocked (laid out) by process, so we don't
1360  // need a separate send buffer (besides the exports array).
1361  for (size_t i = 0; i < numBlocks; ++i) {
1362  size_t p = i + procIndex;
1363  if (p > (numBlocks - 1)) {
1364  p -= numBlocks;
1365  }
1366 
1367  if (procsTo_[p] != myRank) {
1368  if (debug_) {
1369  std::ostringstream os;
1370  os << "Proc " << myRank << ": doPosts(3,fast): Post send: "
1371  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
1372  *out_ << os.str ();
1373  }
1374 
1375  ArrayView<const Packet> tmpSend =
1376  exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
1377 
1378  if (sendType == Details::DISTRIBUTOR_SEND) {
1379  send<int, Packet> (tmpSend.getRawPtr (),
1380  as<int> (tmpSend.size ()),
1381  procsTo_[p], tag, *comm_);
1382  }
1383  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1384  ArrayRCP<const Packet> tmpSendBuf =
1385  exports.persistingView (startsTo_[p] * numPackets,
1386  lengthsTo_[p] * numPackets);
1387  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1388  tag, *comm_));
1389  }
1390  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1391  readySend<int, Packet> (tmpSend.getRawPtr (),
1392  as<int> (tmpSend.size ()),
1393  procsTo_[p], tag, *comm_);
1394  }
1395  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1396  ssend<int, Packet> (tmpSend.getRawPtr (),
1397  as<int> (tmpSend.size ()),
1398  procsTo_[p], tag, *comm_);
1399  } else {
1400  TEUCHOS_TEST_FOR_EXCEPTION(
1401  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
1402  "Invalid send type. We should never get here. "
1403  "Please report this bug to the Tpetra developers.");
1404  }
1405  }
1406  else { // "Sending" the message to myself
1407  selfNum = p;
1408  }
1409  }
1410 
1411  if (selfMessage_) {
1412  if (debug_) {
1413  std::ostringstream os;
1414  os << "Proc " << myRank << ": doPosts(3,fast): Self-send" << endl;
1415  *out_ << os.str ();
1416  }
1417  // This is how we "send a message to ourself": we copy from
1418  // the export buffer to the import buffer. That saves
1419  // Teuchos::Comm implementations other than MpiComm (in
1420  // particular, SerialComm) the trouble of implementing self
1421  // messages correctly. (To do this right, SerialComm would
1422  // need internal buffer space for messages, keyed on the
1423  // message's tag.)
1424  std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
1425  exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
1426  imports.begin()+selfReceiveOffset);
1427  }
1428  if (debug_) {
1429  std::ostringstream os;
1430  os << myRank << ": doPosts(3,fast) done" << endl;
1431  *out_ << os.str ();
1432  }
1433  }
1434  else { // data are not blocked by proc, use send buffer
1435  if (debug_) {
1436  std::ostringstream os;
1437  os << myRank << ": doPosts(3,slow): posting sends" << endl;
1438  *out_ << os.str ();
1439  }
1440 
1441  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
1442  // sends), because the buffer is only long enough for one send.
1443  ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer
1444 
1445  TEUCHOS_TEST_FOR_EXCEPTION(
1446  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1447  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
1448  << "doesn't currently work with nonblocking sends.");
1449 
1450  for (size_t i = 0; i < numBlocks; ++i) {
1451  size_t p = i + procIndex;
1452  if (p > (numBlocks - 1)) {
1453  p -= numBlocks;
1454  }
1455 
1456  if (procsTo_[p] != myRank) {
1457  if (debug_) {
1458  std::ostringstream os;
1459  os << "Proc " << myRank << ": doPosts(3,slow): Post send: "
1460  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
1461  *out_ << os.str ();
1462  }
1463 
1464  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1465  size_t sendArrayOffset = 0;
1466  size_t j = startsTo_[p];
1467  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1468  srcBegin = exports.begin() + indicesTo_[j]*numPackets;
1469  srcEnd = srcBegin + numPackets;
1470  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1471  sendArrayOffset += numPackets;
1472  }
1473  ArrayView<const Packet> tmpSend =
1474  sendArray.view (0, lengthsTo_[p]*numPackets);
1475 
1476  if (sendType == Details::DISTRIBUTOR_SEND) {
1477  send<int, Packet> (tmpSend.getRawPtr (),
1478  as<int> (tmpSend.size ()),
1479  procsTo_[p], tag, *comm_);
1480  }
1481  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1482  ArrayRCP<const Packet> tmpSendBuf =
1483  sendArray.persistingView (0, lengthsTo_[p] * numPackets);
1484  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1485  tag, *comm_));
1486  }
1487  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1488  readySend<int, Packet> (tmpSend.getRawPtr (),
1489  as<int> (tmpSend.size ()),
1490  procsTo_[p], tag, *comm_);
1491  }
1492  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1493  ssend<int, Packet> (tmpSend.getRawPtr (),
1494  as<int> (tmpSend.size ()),
1495  procsTo_[p], tag, *comm_);
1496  }
1497  else {
1498  TEUCHOS_TEST_FOR_EXCEPTION(
1499  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
1500  "Invalid send type. We should never get here. "
1501  "Please report this bug to the Tpetra developers.");
1502  }
1503  }
1504  else { // "Sending" the message to myself
1505  selfNum = p;
1506  selfIndex = startsTo_[p];
1507  }
1508  }
1509 
1510  if (selfMessage_) {
1511  if (debug_) {
1512  std::ostringstream os;
1513  os << "Proc " << myRank << ": doPosts(3,slow): Self-send" << endl;
1514  *out_ << os.str ();
1515  }
1516  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1517  std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
1518  exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
1519  imports.begin() + selfReceiveOffset);
1520  ++selfIndex;
1521  selfReceiveOffset += numPackets;
1522  }
1523  }
1524  if (debug_) {
1525  std::ostringstream os;
1526  os << myRank << ": doPosts(3,slow) done" << endl;
1527  *out_ << os.str ();
1528  }
1529  }
1530 
1531  if (debug_) {
1532  std::ostringstream os;
1533  os << "Proc " << myRank << ": doPosts done" << endl;
1534  *out_ << os.str ();
1535  }
1536  }
1537 
1538  template <class Packet>
1539  void Distributor::
1540  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1541  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1542  const Teuchos::ArrayRCP<Packet>& imports,
1543  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1544  {
1545  using Teuchos::Array;
1546  using Teuchos::ArrayRCP;
1547  using Teuchos::ArrayView;
1548  using Teuchos::as;
1549  using Teuchos::ireceive;
1550  using Teuchos::isend;
1551  using Teuchos::readySend;
1552  using Teuchos::send;
1553  using Teuchos::ssend;
1554  using Teuchos::TypeNameTraits;
1555 #ifdef HAVE_TEUCHOS_DEBUG
1556  using Teuchos::OSTab;
1557 #endif // HAVE_TEUCHOS_DEBUG
1558  using std::endl;
1559  typedef Array<size_t>::size_type size_type;
1560 
1561  Teuchos::OSTab tab (out_);
1562 
1563 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1564  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
1565 #endif // TPETRA_DISTRIBUTOR_TIMERS
1566 
1567  // Run-time configurable parameters that come from the input
1568  // ParameterList set by setParameterList().
1569  const Details::EDistributorSendType sendType = sendType_;
1570  const bool doBarrier = barrierBetween_;
1571 
1572 // #ifdef HAVE_TEUCHOS_DEBUG
1573 // // Prepare for verbose output, if applicable.
1574 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1575 // Teuchos::RCP<Teuchos::FancyOStream> out = this->getOStream ();
1576 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1577 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1578 
1579 // if (doPrint) {
1580 // // Only need one process to print out parameters.
1581 // *out << "Distributor::doPosts (4 args)" << endl;
1582 // }
1583 // // Add one tab level. We declare this outside the doPrint scopes
1584 // // so that the tab persists until the end of this method.
1585 // Teuchos::OSTab tab = this->getOSTab ();
1586 // if (doPrint) {
1587 // *out << "Parameters:" << endl;
1588 // {
1589 // OSTab tab2 (out);
1590 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1591 // << endl << "barrierBetween: " << doBarrier << endl;
1592 // }
1593 // }
1594 // #endif // HAVE_TEUCHOS_DEBUG
1595 
1596  TEUCHOS_TEST_FOR_EXCEPTION(
1597  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1598  "Tpetra::Distributor::doPosts(4 args): Ready-send version requires a "
1599  "barrier between posting receives and posting ready sends. This should "
1600  "have been checked before. "
1601  "Please report this bug to the Tpetra developers.");
1602 
1603  const int myProcID = comm_->getRank ();
1604  size_t selfReceiveOffset = 0;
1605 
1606 #ifdef HAVE_TEUCHOS_DEBUG
1607  // Different messages may have different numbers of packets.
1608  size_t totalNumImportPackets = 0;
1609  for (size_t ii = 0; ii < static_cast<size_t> (numImportPacketsPerLID.size ()); ++ii) {
1610  totalNumImportPackets += numImportPacketsPerLID[ii];
1611  }
1612  TEUCHOS_TEST_FOR_EXCEPTION(
1613  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1614  std::runtime_error, "Tpetra::Distributor::doPosts(4 args): The 'imports' "
1615  "array must have enough entries to hold the expected number of import "
1616  "packets. imports.size() = " << imports.size() << " < "
1617  "totalNumImportPackets = " << totalNumImportPackets << ".");
1618 #endif // HAVE_TEUCHOS_DEBUG
1619 
1620  // MPI tag for nonblocking receives and blocking sends in this
1621  // method. Some processes might take the "fast" path
1622  // (indicesTo_.empty()) and others might take the "slow" path for
1623  // the same doPosts() call, so the path tag must be the same for
1624  // both.
1625  const int pathTag = 1;
1626  const int tag = this->getTag (pathTag);
1627 
1628  if (debug_) {
1629  TEUCHOS_TEST_FOR_EXCEPTION(
1630  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
1631  "doPosts(4 args): Process " << myProcID << ": requests_.size() = "
1632  << requests_.size () << " != 0.");
1633  std::ostringstream os;
1634  os << myProcID << ": doPosts(4,"
1635  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
1636  *out_ << os.str ();
1637  }
1638 
1639  // Distributor uses requests_.size() as the number of outstanding
1640  // nonblocking message requests, so we resize to zero to maintain
1641  // this invariant.
1642  //
1643  // numReceives_ does _not_ include the self message, if there is
1644  // one. Here, we do actually send a message to ourselves, so we
1645  // include any self message in the "actual" number of receives to
1646  // post.
1647  //
1648  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1649  // doesn't (re)allocate its array of requests. That happens in
1650  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1651  // demand), or Resize_().
1652  const size_type actualNumReceives = as<size_type> (numReceives_) +
1653  as<size_type> (selfMessage_ ? 1 : 0);
1654  requests_.resize (0);
1655 
1656  // Post the nonblocking receives. It's common MPI wisdom to post
1657  // receives before sends. In MPI terms, this means favoring
1658  // adding to the "posted queue" (of receive requests) over adding
1659  // to the "unexpected queue" (of arrived messages not yet matched
1660  // with a receive).
1661  {
1662 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1663  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
1664 #endif // TPETRA_DISTRIBUTOR_TIMERS
1665 
1666  size_t curBufferOffset = 0;
1667  size_t curLIDoffset = 0;
1668  for (size_type i = 0; i < actualNumReceives; ++i) {
1669  size_t totalPacketsFrom_i = 0;
1670  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
1671  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
1672  }
1673  curLIDoffset += lengthsFrom_[i];
1674  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
1675  // If my process is receiving these packet(s) from another
1676  // process (not a self-receive), and if there is at least
1677  // one packet to receive:
1678  //
1679  // 1. Set up the persisting view (recvBuf) into the imports
1680  // array, given the offset and size (total number of
1681  // packets from process procsFrom_[i]).
1682  // 2. Start the Irecv and save the resulting request.
1683  ArrayRCP<Packet> recvBuf =
1684  imports.persistingView (curBufferOffset, totalPacketsFrom_i);
1685  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1686  tag, *comm_));
1687  }
1688  else { // Receiving these packet(s) from myself
1689  selfReceiveOffset = curBufferOffset; // Remember the offset
1690  }
1691  curBufferOffset += totalPacketsFrom_i;
1692  }
1693  }
1694 
1695  if (doBarrier) {
1696 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1697  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
1698 #endif // TPETRA_DISTRIBUTOR_TIMERS
1699  // If we are using ready sends (MPI_Rsend) below, we need to do
1700  // a barrier before we post the ready sends. This is because a
1701  // ready send requires that its matching receive has already
1702  // been posted before the send has been posted. The only way to
1703  // guarantee that in this case is to use a barrier.
1704  comm_->barrier ();
1705  }
1706 
1707 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1708  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
1709 #endif // TPETRA_DISTRIBUTOR_TIMERS
1710 
 1711  // Set up arrays containing the starting offset into exports for each send,
 1712  // and the number of packets to send for each send.
1713  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
1714  size_t maxNumPackets = 0;
1715  size_t curPKToffset = 0;
1716  for (size_t pp=0; pp<numSends_; ++pp) {
1717  sendPacketOffsets[pp] = curPKToffset;
1718  size_t numPackets = 0;
1719  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
1720  numPackets += numExportPacketsPerLID[j];
1721  }
1722  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
1723  packetsPerSend[pp] = numPackets;
1724  curPKToffset += numPackets;
1725  }
1726 
 1727  // Set up the scan through the procsTo_ list, starting with higher-numbered
 1728  // processes (this should help balance message traffic).
1729  size_t numBlocks = numSends_+ selfMessage_;
1730  size_t procIndex = 0;
1731  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
1732  ++procIndex;
1733  }
1734  if (procIndex == numBlocks) {
1735  procIndex = 0;
1736  }
1737 
1738  size_t selfNum = 0;
1739  size_t selfIndex = 0;
1740 
1741  if (indicesTo_.empty()) {
1742  if (debug_) {
1743  std::ostringstream os;
1744  os << myProcID << ": doPosts(4,fast): posting sends" << endl;
1745  *out_ << os.str ();
1746  }
1747 
1748  // Data are already blocked (laid out) by process, so we don't
1749  // need a separate send buffer (besides the exports array).
1750  for (size_t i = 0; i < numBlocks; ++i) {
1751  size_t p = i + procIndex;
1752  if (p > (numBlocks - 1)) {
1753  p -= numBlocks;
1754  }
1755 
1756  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
1757  ArrayView<const Packet> tmpSend =
1758  exports.view (sendPacketOffsets[p], packetsPerSend[p]);
1759 
1760  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
1761  send<int, Packet> (tmpSend.getRawPtr (),
1762  as<int> (tmpSend.size ()),
1763  procsTo_[p], tag, *comm_);
1764  }
1765  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1766  readySend<int, Packet> (tmpSend.getRawPtr (),
1767  as<int> (tmpSend.size ()),
1768  procsTo_[p], tag, *comm_);
1769  }
1770  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1771  ArrayRCP<const Packet> tmpSendBuf =
1772  exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
1773  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1774  tag, *comm_));
1775  }
1776  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1777  ssend<int, Packet> (tmpSend.getRawPtr (),
1778  as<int> (tmpSend.size ()),
1779  procsTo_[p], tag, *comm_);
1780  }
1781  else {
1782  TEUCHOS_TEST_FOR_EXCEPTION(
1783  true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): "
1784  "Invalid send type. We should never get here. Please report "
1785  "this bug to the Tpetra developers.");
1786  }
1787  }
1788  else { // "Sending" the message to myself
1789  selfNum = p;
1790  }
1791  }
1792 
1793  if (selfMessage_) {
1794  std::copy (exports.begin()+sendPacketOffsets[selfNum],
1795  exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
1796  imports.begin()+selfReceiveOffset);
1797  }
1798  if (debug_) {
1799  std::ostringstream os;
1800  os << myProcID << ": doPosts(4,fast) done" << endl;
1801  *out_ << os.str ();
1802  }
1803  }
1804  else { // data are not blocked by proc, use send buffer
1805  if (debug_) {
1806  std::ostringstream os;
1807  os << myProcID << ": doPosts(4,slow): posting sends" << endl;
1808  *out_ << os.str ();
1809  }
1810 
1811  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
1812  ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer
1813 
1814  TEUCHOS_TEST_FOR_EXCEPTION(
1815  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
 1816  "Tpetra::Distributor::doPosts(4 args): The \"send buffer\" "
 1817  "code path may not work with nonblocking sends.");
1818 
1819  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
1820  size_t ioffset = 0;
1821  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
1822  indicesOffsets[j] = ioffset;
1823  ioffset += numExportPacketsPerLID[j];
1824  }
1825 
1826  for (size_t i = 0; i < numBlocks; ++i) {
1827  size_t p = i + procIndex;
1828  if (p > (numBlocks - 1)) {
1829  p -= numBlocks;
1830  }
1831 
1832  if (procsTo_[p] != myProcID) {
1833  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1834  size_t sendArrayOffset = 0;
1835  size_t j = startsTo_[p];
1836  size_t numPacketsTo_p = 0;
1837  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1838  srcBegin = exports.begin() + indicesOffsets[j];
1839  srcEnd = srcBegin + numExportPacketsPerLID[j];
1840  numPacketsTo_p += numExportPacketsPerLID[j];
1841  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1842  sendArrayOffset += numExportPacketsPerLID[j];
1843  }
1844  if (numPacketsTo_p > 0) {
1845  ArrayView<const Packet> tmpSend =
1846  sendArray.view (0, numPacketsTo_p);
1847 
1848  if (sendType == Details::DISTRIBUTOR_RSEND) {
1849  readySend<int, Packet> (tmpSend.getRawPtr (),
1850  as<int> (tmpSend.size ()),
1851  procsTo_[p], tag, *comm_);
1852  }
1853  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1854  ArrayRCP<const Packet> tmpSendBuf =
1855  sendArray.persistingView (0, numPacketsTo_p);
1856  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1857  tag, *comm_));
1858  }
1859  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1860  ssend<int, Packet> (tmpSend.getRawPtr (),
1861  as<int> (tmpSend.size ()),
1862  procsTo_[p], tag, *comm_);
1863  }
 1864  else { // sendType == Details::DISTRIBUTOR_SEND (the default)
1865  send<int, Packet> (tmpSend.getRawPtr (),
1866  as<int> (tmpSend.size ()),
1867  procsTo_[p], tag, *comm_);
1868  }
1869  }
1870  }
1871  else { // "Sending" the message to myself
1872  selfNum = p;
1873  selfIndex = startsTo_[p];
1874  }
1875  }
1876 
1877  if (selfMessage_) {
1878  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1879  std::copy (exports.begin()+indicesOffsets[selfIndex],
1880  exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
1881  imports.begin() + selfReceiveOffset);
1882  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
1883  ++selfIndex;
1884  }
1885  }
1886  if (debug_) {
1887  std::ostringstream os;
1888  os << myProcID << ": doPosts(4,slow) done" << endl;
1889  *out_ << os.str ();
1890  }
1891  }
1892  }
1893 
1894  template <class Packet>
1895  void Distributor::
1896  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1897  size_t numPackets,
1898  const Teuchos::ArrayView<Packet>& imports)
1899  {
1900  using Teuchos::arcp;
1901  using Teuchos::ArrayRCP;
1902  using Teuchos::as;
1903 
1904  // doReversePosts() takes exports and imports as ArrayRCPs,
1905  // requiring that the memory locations are persisting. However,
 1906  // requiring that the memory locations persist. However,
1907  // is safe for us to use nonpersisting references in this case.
1908 
1909  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1910  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1911  // with some versions of GCC. The issue only arises with the
1912  // exports array. This is why we construct a separate nonowning
1913  // ArrayRCP.
1914  typedef typename ArrayRCP<const Packet>::size_type size_type;
1915  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
1916  exports.size(), false);
1917  // mfh 04 Apr 2012: This is the offending code. This statement
1918  // would normally be in place of "exportsArcp" in the
1919  // doReversePosts() call below.
1920  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
1921  doReversePosts (exportsArcp,
1922  numPackets,
1923  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1924  doReverseWaits ();
1925 
1926  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1927  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1928  }
1929 
1930  template <class Packet>
1931  void Distributor::
1932  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1933  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1934  const Teuchos::ArrayView<Packet> &imports,
1935  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1936  {
1937  using Teuchos::as;
1938  using Teuchos::arcp;
1939  using Teuchos::ArrayRCP;
1940 
1941  TEUCHOS_TEST_FOR_EXCEPTION(
1942  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1943  "doReversePostsAndWaits(4 args): There are " << requests_.size ()
1944  << " outstanding nonblocking messages pending. It is incorrect to call "
1945  "this method with posts outstanding.");
1946 
1947  // doReversePosts() accepts the exports and imports arrays as
 1948  // ArrayRCPs, requiring that the memory they point to persists (as
1949  // is necessary for nonblocking receives). However, it need only
1950  // persist until doReverseWaits() completes, so it is safe for us
1951  // to use a nonpersisting reference in this case. The use of a
1952  // nonpersisting reference is purely a performance optimization.
1953 
1954  // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
1955  // for Packet=std::complex<double> fails to compile with some
1956  // versions of GCC. The issue only arises with the exports array.
1957  // This is why we construct a separate nonowning ArrayRCP.
1958  typedef typename ArrayRCP<const Packet>::size_type size_type;
1959  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
1960  exports.size (), false);
1961  doReversePosts (exportsArcp,
1962  numExportPacketsPerLID,
1963  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1964  numImportPacketsPerLID);
1965  doReverseWaits ();
1966 
1967  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1968  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1969  }
1970 
1971  template <class Packet>
1972  void Distributor::
1973  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1974  size_t numPackets,
1975  const Teuchos::ArrayRCP<Packet>& imports)
1976  {
1977  // FIXME (mfh 29 Mar 2012) WHY?
1978  TEUCHOS_TEST_FOR_EXCEPTION(
1979  ! indicesTo_.empty (), std::runtime_error,
1980  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1981  "communication when original data are blocked by process.");
1982  if (reverseDistributor_.is_null ()) {
1983  createReverseDistributor ();
1984  }
1985  reverseDistributor_->doPosts (exports, numPackets, imports);
1986  }
1987 
1988  template <class Packet>
1989  void Distributor::
1990  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1991  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1992  const Teuchos::ArrayRCP<Packet>& imports,
1993  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1994  {
1995  // FIXME (mfh 29 Mar 2012) WHY?
1996  TEUCHOS_TEST_FOR_EXCEPTION(
1997  ! indicesTo_.empty (), std::runtime_error,
 1998  "Tpetra::Distributor::doReversePosts(4 args): Can only do reverse "
1999  "communication when original data are blocked by process.");
2000  if (reverseDistributor_.is_null ()) {
2001  createReverseDistributor ();
2002  }
2003  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2004  imports, numImportPacketsPerLID);
2005  }
2006 
2007  template <class ExpView, class ImpView>
2008  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2009  Distributor::
2010  doPostsAndWaits (const ExpView& exports,
2011  size_t numPackets,
2012  const ImpView& imports)
2013  {
2014  using Teuchos::RCP;
2015  using Teuchos::rcp;
2016  using std::endl;
2017 
2018  RCP<Teuchos::OSTab> tab0, tab1;
2019  if (debug_) {
2020  tab0 = rcp (new Teuchos::OSTab (out_));
2021  const int myRank = comm_->getRank ();
2022  std::ostringstream os;
2023  os << "Proc " << myRank
2024  << ": Distributor::doPostsAndWaits(3 args, Kokkos): "
2025  << "{sendType: " << DistributorSendTypeEnumToString (sendType_)
2026  << ", barrierBetween: " << barrierBetween_ << "}" << endl;
2027  *out_ << os.str ();
2028  tab1 = rcp (new Teuchos::OSTab (out_));
2029  }
2030 
2031  TEUCHOS_TEST_FOR_EXCEPTION(
2032  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
2033  "doPostsAndWaits(3 args): There are " << requests_.size () <<
2034  " outstanding nonblocking messages pending. It is incorrect to call "
2035  "this method with posts outstanding.");
2036 
2037  if (debug_) {
2038  const int myRank = comm_->getRank ();
2039  std::ostringstream os;
2040  os << "Proc " << myRank
2041  << ": Distributor::doPostsAndWaits: Call doPosts" << endl;
2042  *out_ << os.str ();
2043  }
2044  doPosts (exports, numPackets, imports);
2045  if (debug_) {
2046  const int myRank = comm_->getRank ();
2047  std::ostringstream os;
2048  os << "Proc " << myRank
2049  << ": Distributor::doPostsAndWaits: Call doWaits" << endl;
2050  *out_ << os.str ();
2051  }
2052  doWaits ();
2053  }
2054 
2055  template <class ExpView, class ImpView>
2056  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2057  Distributor::
2058  doPostsAndWaits (const ExpView& exports,
2059  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2060  const ImpView& imports,
2061  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2062  {
2063  TEUCHOS_TEST_FOR_EXCEPTION(
2064  requests_.size () != 0, std::runtime_error,
2065  "Tpetra::Distributor::doPostsAndWaits(4 args): There are "
2066  << requests_.size () << " outstanding nonblocking messages pending. "
2067  "It is incorrect to call this method with posts outstanding.");
2068 
2069  doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
2070  doWaits ();
2071  }
2072 
2073 
2074  template <class ExpView, class ImpView>
2075  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2076  Distributor::
2077  doPosts (const ExpView &exports,
2078  size_t numPackets,
2079  const ImpView &imports)
2080  {
2081  using Teuchos::Array;
2082  using Teuchos::as;
2083  using Teuchos::FancyOStream;
2084  using Teuchos::includesVerbLevel;
2085  using Teuchos::ireceive;
2086  using Teuchos::isend;
2087  using Teuchos::OSTab;
2088  using Teuchos::readySend;
2089  using Teuchos::send;
2090  using Teuchos::ssend;
2091  using Teuchos::TypeNameTraits;
2092  using Teuchos::typeName;
2093  using std::endl;
2094  using Kokkos::Compat::create_const_view;
2095  using Kokkos::Compat::create_view;
2096  using Kokkos::Compat::subview_offset;
2097  using Kokkos::Compat::deep_copy_offset;
2098  typedef Array<size_t>::size_type size_type;
2099  typedef ExpView exports_view_type;
2100  typedef ImpView imports_view_type;
2101 
2102 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2103  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
2104 #endif // TPETRA_DISTRIBUTOR_TIMERS
2105 
2106  const int myRank = comm_->getRank ();
2107  // Run-time configurable parameters that come from the input
2108  // ParameterList set by setParameterList().
2109  const Details::EDistributorSendType sendType = sendType_;
2110  const bool doBarrier = barrierBetween_;
2111 
2112  Teuchos::OSTab tab0 (out_);
2113  if (debug_) {
2114  std::ostringstream os;
2115  os << "Proc " << myRank << ": Distributor::doPosts(3 args, Kokkos)" << endl;
2116  *out_ << os.str ();
2117  }
2118  Teuchos::OSTab tab1 (out_);
2119 
2120  TEUCHOS_TEST_FOR_EXCEPTION(
2121  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
2122  "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
2123  "barrier between posting receives and posting ready sends. This should "
2124  "have been checked before. "
2125  "Please report this bug to the Tpetra developers.");
2126 
2127  size_t selfReceiveOffset = 0;
2128 
2129  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
2130  // check whether we're doing reverse mode before checking the
2131  // length of the imports array.
2132  if (false /* howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE */) {
2133  // Each message has the same number of packets.
2134  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
2135 
2136  if (debug_) {
2137  std::ostringstream os;
2138  os << "Proc " << myRank << ": doPosts: totalNumImportPackets = " <<
2139  totalNumImportPackets << " = " << totalReceiveLength_ << " * " <<
2140  numPackets << "; imports.dimension_0() = " << imports.dimension_0 ()
2141  << endl;
2142  *out_ << os.str ();
2143  }
2144 
2145 #ifdef HAVE_TPETRA_DEBUG
2146  // mfh 31 Mar 2016: Extra special all-reduce check to help diagnose #227.
2147  {
2148  const size_t importBufSize = static_cast<size_t> (imports.dimension_0 ());
2149  const int lclBad = (importBufSize < totalNumImportPackets) ? 1 : 0;
2150  int gblBad = 0;
2151  using Teuchos::reduceAll;
2152  using Teuchos::REDUCE_MAX;
2153  using Teuchos::outArg;
2154  reduceAll (*comm_, REDUCE_MAX, lclBad, outArg (gblBad));
2155  TEUCHOS_TEST_FOR_EXCEPTION
2156  (gblBad != 0, std::runtime_error, "Tpetra::Distributor::doPosts(3 "
2157  "args, Kokkos): On one or more MPI processes, the 'imports' array "
2158  "does not have enough entries to hold the expected number of "
2159  "import packets. ");
2160  }
2161 #else
2162  TEUCHOS_TEST_FOR_EXCEPTION
2163  (static_cast<size_t> (imports.dimension_0 ()) < totalNumImportPackets,
2164  std::runtime_error, "Tpetra::Distributor::doPosts(3 args): The 'imports' "
2165  "array must have enough entries to hold the expected number of import "
2166  "packets. imports.dimension_0() = " << imports.dimension_0 () << " < "
2167  "totalNumImportPackets = " << totalNumImportPackets << " = "
2168  "totalReceiveLength_ (" << totalReceiveLength_ << ") * numPackets ("
2169  << numPackets << ").");
2170 #endif // HAVE_TPETRA_DEBUG
2171  }
2172 
2173  // MPI tag for nonblocking receives and blocking sends in this
2174  // method. Some processes might take the "fast" path
2175  // (indicesTo_.empty()) and others might take the "slow" path for
2176  // the same doPosts() call, so the path tag must be the same for
2177  // both.
2178  const int pathTag = 0;
2179  const int tag = this->getTag (pathTag);
2180 
2181 #ifdef HAVE_TPETRA_DEBUG
2182  TEUCHOS_TEST_FOR_EXCEPTION
2183  (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2184  "doPosts(3 args, Kokkos): Process " << myRank << ": requests_.size() = "
2185  << requests_.size () << " != 0.");
2186 #endif // HAVE_TPETRA_DEBUG
2187 
2188  // Distributor uses requests_.size() as the number of outstanding
2189  // nonblocking message requests, so we resize to zero to maintain
2190  // this invariant.
2191  //
2192  // numReceives_ does _not_ include the self message, if there is
2193  // one. Here, we do actually send a message to ourselves, so we
2194  // include any self message in the "actual" number of receives to
2195  // post.
2196  //
2197  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2198  // doesn't (re)allocate its array of requests. That happens in
2199  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2200  // demand), or Resize_().
2201  const size_type actualNumReceives = as<size_type> (numReceives_) +
2202  as<size_type> (selfMessage_ ? 1 : 0);
2203  requests_.resize (0);
2204 
2205  if (debug_) {
2206  std::ostringstream os;
2207  os << "Proc " << myRank << ": doPosts(3,"
2208  << (indicesTo_.empty () ? "fast" : "slow") << "): Post receives"
2209  << endl;
2210  *out_ << os.str ();
2211  }
2212 
2213  // Post the nonblocking receives. It's common MPI wisdom to post
2214  // receives before sends. In MPI terms, this means favoring
2215  // adding to the "posted queue" (of receive requests) over adding
2216  // to the "unexpected queue" (of arrived messages not yet matched
2217  // with a receive).
2218  {
2219 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2220  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
2221 #endif // TPETRA_DISTRIBUTOR_TIMERS
2222 
2223  size_t curBufferOffset = 0;
2224  for (size_type i = 0; i < actualNumReceives; ++i) {
2225  const size_t curBufLen = lengthsFrom_[i] * numPackets;
2226  if (procsFrom_[i] != myRank) {
2227  if (debug_) {
2228  std::ostringstream os;
2229  os << "Proc " << myRank << ": doPosts(3,"
2230  << (indicesTo_.empty () ? "fast" : "slow") << "): "
2231  << "Post irecv: {source: " << procsFrom_[i]
2232  << ", tag: " << tag << "}" << endl;
2233  *out_ << os.str ();
2234  }
2235  // If my process is receiving these packet(s) from another
2236  // process (not a self-receive):
2237  //
2238  // 1. Set up the persisting view (recvBuf) of the imports
2239  // array, given the offset and size (total number of
2240  // packets from process procsFrom_[i]).
2241  // 2. Start the Irecv and save the resulting request.
2242  TEUCHOS_TEST_FOR_EXCEPTION(
2243  curBufferOffset + curBufLen > static_cast<size_t> (imports.size ()),
2244  std::logic_error, "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2245  "Exceeded size of 'imports' array while posting receives on Process " <<
2246  myRank << ". imports.size() = " << imports.size () << " < "
2247  "curBufferOffset(" << curBufferOffset << ") + curBufLen(" <<
2248  curBufLen << ").");
2249  imports_view_type recvBuf =
2250  subview_offset (imports, curBufferOffset, curBufLen);
2251  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2252  tag, *comm_));
2253  }
2254  else { // Receiving from myself
2255  selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
2256  }
2257  curBufferOffset += curBufLen;
2258  }
2259  }
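  // Concrete illustration of the receive offsets computed above (values
  // hypothetical): with lengthsFrom_ = {2, 3} and numPackets = 4, the first
  // receive buffer is imports[0..7] (8 packets at offset 0) and the second
  // is imports[8..19] (12 packets at offset 8); a self-receive only records
  // its offset in selfReceiveOffset instead of posting an irecv.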
2260 
2261  if (doBarrier) {
2262 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2263  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
2264 #endif // TPETRA_DISTRIBUTOR_TIMERS
2265 
2266  if (debug_) {
2267  std::ostringstream os;
2268  os << "Proc " << myRank << ": doPosts(3,"
2269  << (indicesTo_.empty () ? "fast" : "slow") << "): Barrier" << endl;
2270  *out_ << os.str ();
2271  }
2272  // If we are using ready sends (MPI_Rsend) below, we need to do
2273  // a barrier before we post the ready sends. This is because a
2274  // ready send requires that its matching receive has already
2275  // been posted before the send has been posted. The only way to
2276  // guarantee that in this case is to use a barrier.
2277  comm_->barrier ();
2278  }
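  // Ordering the barrier above enforces for ready sends (illustrative):
  //
  //   all ranks:  post ireceive(...) for every expected message
  //   all ranks:  barrier()        <-- receives are now guaranteed posted
  //   all ranks:  readySend(...)   <-- legal only because of the barrier
  //
  // Without that guarantee, MPI_Rsend is erroneous if its matching receive
  // has not yet been posted.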
2279 
2280 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2281  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
2282 #endif // TPETRA_DISTRIBUTOR_TIMERS
2283 
2284  // Set up the scan through the procsTo_ list, starting with higher-numbered
2285  // processes (this should help balance message traffic).
2286  //
2287  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
2288  // It doesn't depend on the input at all.
2289  size_t numBlocks = numSends_ + selfMessage_;
2290  size_t procIndex = 0;
2291  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
2292  ++procIndex;
2293  }
2294  if (procIndex == numBlocks) {
2295  procIndex = 0;
2296  }
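  // Worked example of the rotation above (hypothetical values): with
  // myRank = 2 and procsTo_ = {0, 1, 3, 5}, the scan stops at procIndex = 2,
  // so the send loops below visit blocks in the order p = 2, 3, 0, 1, i.e.,
  // they send to processes 3, 5, 0, 1.  Starting at the first rank >= myRank
  // staggers traffic instead of having every process target rank 0 first.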
2297 
2298  size_t selfNum = 0;
2299  size_t selfIndex = 0;
2300 
2301  if (debug_) {
2302  std::ostringstream os;
2303  os << "Proc " << myRank << ": doPosts(3,"
2304  << (indicesTo_.empty () ? "fast" : "slow") << "): Post sends" << endl;
2305  *out_ << os.str ();
2306  }
2307 
2308  if (indicesTo_.empty()) {
2309  if (debug_) {
2310  std::ostringstream os;
2311  os << "Proc " << myRank << ": doPosts(3,fast): posting sends" << endl;
2312  *out_ << os.str ();
2313  }
2314 
2315  // Data are already blocked (laid out) by process, so we don't
2316  // need a separate send buffer (besides the exports array).
2317  for (size_t i = 0; i < numBlocks; ++i) {
2318  size_t p = i + procIndex;
2319  if (p > (numBlocks - 1)) {
2320  p -= numBlocks;
2321  }
2322 
2323  if (procsTo_[p] != myRank) {
2324  if (debug_) {
2325  std::ostringstream os;
2326  os << "Proc " << myRank << ": doPosts(3,fast): Post send: "
2327  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
2328  *out_ << os.str ();
2329  }
2330  // if (debug_) {
2331  // const size_t off = startsTo_[p] * numPackets;
2332  // const size_t len = lengthsTo_[p] * numPackets;
2333  // TEUCHOS_TEST_FOR_EXCEPTION
2334  // (static_cast<size_t> (off + len) >
2335  // static_cast<size_t> (exports.size ()), std::logic_error,
2336  // "doPosts: off=" << off << " + len=" << len << " > "
2337  // "exports.size()=" << exports.size () << ".");
2338  // }
2339 
2340  exports_view_type tmpSend = subview_offset(
2341  exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
2342 
2343  if (sendType == Details::DISTRIBUTOR_SEND) {
2344  send<int> (tmpSend,
2345  as<int> (tmpSend.size ()),
2346  procsTo_[p], tag, *comm_);
2347  }
2348  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2349  exports_view_type tmpSendBuf =
2350  subview_offset (exports, startsTo_[p] * numPackets,
2351  lengthsTo_[p] * numPackets);
2352  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2353  tag, *comm_));
2354  }
2355  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2356  readySend<int> (tmpSend,
2357  as<int> (tmpSend.size ()),
2358  procsTo_[p], tag, *comm_);
2359  }
2360  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2361  ssend<int> (tmpSend,
2362  as<int> (tmpSend.size ()),
2363  procsTo_[p], tag, *comm_);
2364  } else {
2365  TEUCHOS_TEST_FOR_EXCEPTION(
2366  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
2367  "Invalid send type. We should never get here. "
2368  "Please report this bug to the Tpetra developers.");
2369  }
2370  }
2371  else { // "Sending" the message to myself
2372  selfNum = p;
2373  }
2374  }
2375 
2376  if (selfMessage_) {
2377  if (debug_) {
2378  std::ostringstream os;
2379  os << "Proc " << myRank << ": doPosts(3,fast): Self-send" << endl;
2380  *out_ << os.str ();
2381  }
2382  // This is how we "send a message to ourself": we copy from
2383  // the export buffer to the import buffer. That saves
2384  // Teuchos::Comm implementations other than MpiComm (in
2385  // particular, SerialComm) the trouble of implementing self
2386  // messages correctly. (To do this right, SerialComm would
2387  // need internal buffer space for messages, keyed on the
2388  // message's tag.)
2389  deep_copy_offset(imports, exports, selfReceiveOffset,
2390  startsTo_[selfNum]*numPackets,
2391  lengthsTo_[selfNum]*numPackets);
2392  }
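  // For reference, a sketch (not a definition) of the helper used above:
  // Kokkos::Compat::deep_copy_offset (dst, src, dstOff, srcOff, len) behaves
  // roughly like
  //
  //   auto d = Kokkos::subview (dst, std::make_pair (dstOff, dstOff + len));
  //   auto s = Kokkos::subview (src, std::make_pair (srcOff, srcOff + len));
  //   Kokkos::deep_copy (d, s);
  //
  // so the self-message is just a copy of this process' own block of
  // 'exports' into its slot of 'imports'.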
2393  if (debug_) {
2394  std::ostringstream os;
2395  os << "Proc " << myRank << ": doPosts(3,fast) done" << endl;
2396  *out_ << os.str ();
2397  }
2398  }
2399  else { // data are not blocked by proc, use send buffer
2400  if (debug_) {
2401  std::ostringstream os;
2402  os << "Proc " << myRank << ": doPosts(3,slow): posting sends" << endl;
2403  *out_ << os.str ();
2404  }
2405 
2406  typedef typename ExpView::non_const_value_type Packet;
2407  typedef typename ExpView::array_layout Layout;
2408  typedef typename ExpView::device_type Device;
2409  typedef typename ExpView::memory_traits Mem;
2410  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
2411  maxSendLength_ * numPackets);
2412 
2413  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
2414  // sends), because the buffer is only long enough for one send.
2415  TEUCHOS_TEST_FOR_EXCEPTION(
2416  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
2417  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
2418  "doesn't currently work with nonblocking sends.");
2419 
2420  for (size_t i = 0; i < numBlocks; ++i) {
2421  size_t p = i + procIndex;
2422  if (p > (numBlocks - 1)) {
2423  p -= numBlocks;
2424  }
2425 
2426  if (procsTo_[p] != myRank) {
2427  if (debug_) {
2428  std::ostringstream os;
2429  os << "Proc " << myRank << ": doPosts(3,slow): Post send: "
2430  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
2431  *out_ << os.str ();
2432  }
2433 
2434  size_t sendArrayOffset = 0;
2435  size_t j = startsTo_[p];
2436  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2437  deep_copy_offset(sendArray, exports, sendArrayOffset,
2438  indicesTo_[j]*numPackets, numPackets);
2439  sendArrayOffset += numPackets;
2440  }
2441  ImpView tmpSend =
2442  subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
2443 
2444  if (sendType == Details::DISTRIBUTOR_SEND) {
2445  send<int> (tmpSend,
2446  as<int> (tmpSend.size ()),
2447  procsTo_[p], tag, *comm_);
2448  }
2449  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2450  exports_view_type tmpSendBuf =
2451  subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
2452  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2453  tag, *comm_));
2454  }
2455  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2456  readySend<int> (tmpSend,
2457  as<int> (tmpSend.size ()),
2458  procsTo_[p], tag, *comm_);
2459  }
2460  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2461  ssend<int> (tmpSend,
2462  as<int> (tmpSend.size ()),
2463  procsTo_[p], tag, *comm_);
2464  }
2465  else {
2466  TEUCHOS_TEST_FOR_EXCEPTION(
2467  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
2468  "Invalid send type. We should never get here. "
2469  "Please report this bug to the Tpetra developers.");
2470  }
2471  }
2472  else { // "Sending" the message to myself
2473  selfNum = p;
2474  selfIndex = startsTo_[p];
2475  }
2476  }
2477 
2478  if (selfMessage_) {
2479  if (debug_) {
2480  std::ostringstream os;
2481  os << "Proc " << myRank << ": doPosts(3,slow): Self-send" << endl;
2482  *out_ << os.str ();
2483  }
2484  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2485  deep_copy_offset(imports, exports, selfReceiveOffset,
2486  indicesTo_[selfIndex]*numPackets, numPackets);
2487  ++selfIndex;
2488  selfReceiveOffset += numPackets;
2489  }
2490  }
2491  if (debug_) {
2492  std::ostringstream os;
2493  os << "Proc " << myRank << ": doPosts(3,slow) done" << endl;
2494  *out_ << os.str ();
2495  }
2496  }
2497 
2498  if (debug_) {
2499  std::ostringstream os;
2500  os << "Proc " << myRank << ": doPosts done" << endl;
2501  *out_ << os.str ();
2502  }
2503  }
2504 
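  // Illustrative sketch (hypothetical names) of driving the variable-size
  // overload defined below, where each local ID carries its own packet count:
  //
  //   Teuchos::Array<size_t> numExpPacketsPerLID (numExportIDs);
  //   Teuchos::Array<size_t> numImpPacketsPerLID (numRemoteIDs);
  //   // ... fill numExpPacketsPerLID, then exchange it (for example with the
  //   //     fixed-size doPostsAndWaits and numPackets = 1) to get
  //   //     numImpPacketsPerLID ...
  //   distor.doPosts (exports, numExpPacketsPerLID (), imports,
  //                   numImpPacketsPerLID ());
  //   distor.doWaits ();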
2505  template <class ExpView, class ImpView>
2506  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2507  Distributor::
2508  doPosts (const ExpView &exports,
2509  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2510  const ImpView &imports,
2511  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2512  {
2513  using Teuchos::Array;
2514  using Teuchos::as;
2515  using Teuchos::ireceive;
2516  using Teuchos::isend;
2517  using Teuchos::readySend;
2518  using Teuchos::send;
2519  using Teuchos::ssend;
2520  using Teuchos::TypeNameTraits;
2521 #ifdef HAVE_TEUCHOS_DEBUG
2522  using Teuchos::OSTab;
2523 #endif // HAVE_TEUCHOS_DEBUG
2524  using std::endl;
2525  using Kokkos::Compat::create_const_view;
2526  using Kokkos::Compat::create_view;
2527  using Kokkos::Compat::subview_offset;
2528  using Kokkos::Compat::deep_copy_offset;
2529  typedef Array<size_t>::size_type size_type;
2530  typedef ExpView exports_view_type;
2531  typedef ImpView imports_view_type;
2532 
2533  Teuchos::OSTab tab (out_);
2534 
2535 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2536  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
2537 #endif // TPETRA_DISTRIBUTOR_TIMERS
2538 
2539  // Run-time configurable parameters that come from the input
2540  // ParameterList set by setParameterList().
2541  const Details::EDistributorSendType sendType = sendType_;
2542  const bool doBarrier = barrierBetween_;
2543 
2544 // #ifdef HAVE_TEUCHOS_DEBUG
2545 // // Prepare for verbose output, if applicable.
2546 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2547 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
2548 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2549 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2550 
2551 // if (doPrint) {
2552 // // Only need one process to print out parameters.
2553 // *out << "Distributor::doPosts (4 args)" << endl;
2554 // }
2555 // // Add one tab level. We declare this outside the doPrint scopes
2556 // // so that the tab persists until the end of this method.
2557 // Teuchos::OSTab tab = this->getOSTab ();
2558 // if (doPrint) {
2559 // *out << "Parameters:" << endl;
2560 // {
2561 // OSTab tab2 (out);
2562 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2563 // << endl << "barrierBetween: " << doBarrier << endl;
2564 // }
2565 // }
2566 // #endif // HAVE_TEUCHOS_DEBUG
2567 
2568  TEUCHOS_TEST_FOR_EXCEPTION(
2569  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2570  std::logic_error, "Tpetra::Distributor::doPosts(4 args): Ready-send "
2571  "version requires a barrier between posting receives and posting ready "
2572  "sends. This should have been checked before. "
2573  "Please report this bug to the Tpetra developers.");
2574 
2575  const int myProcID = comm_->getRank ();
2576  size_t selfReceiveOffset = 0;
2577 
2578 #ifdef HAVE_TEUCHOS_DEBUG
2579  // Different messages may have different numbers of packets.
2580  size_t totalNumImportPackets = 0;
2581  for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
2582  totalNumImportPackets += numImportPacketsPerLID[ii];
2583  }
2584  TEUCHOS_TEST_FOR_EXCEPTION(
2585  imports.dimension_0 () < totalNumImportPackets, std::runtime_error,
2586  "Tpetra::Distributor::doPosts(4 args): The 'imports' array must have "
2587  "enough entries to hold the expected number of import packets. "
2588  "imports.dimension_0() = " << imports.dimension_0 () << " < "
2589  "totalNumImportPackets = " << totalNumImportPackets << ".");
2590 #endif // HAVE_TEUCHOS_DEBUG
2591 
2592  // MPI tag for nonblocking receives and blocking sends in this
2593  // method. Some processes might take the "fast" path
2594  // (indicesTo_.empty()) and others might take the "slow" path for
2595  // the same doPosts() call, so the path tag must be the same for
2596  // both.
2597  const int pathTag = 1;
2598  const int tag = this->getTag (pathTag);
2599 
2600  if (debug_) {
2601  TEUCHOS_TEST_FOR_EXCEPTION(
2602  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2603  "doPosts(4 args): Process " << myProcID << ": requests_.size () = "
2604  << requests_.size () << " != 0.");
2605  std::ostringstream os;
2606  os << myProcID << ": doPosts(4,"
2607  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
2608  *out_ << os.str ();
2609  }
2610 
2611  // Distributor uses requests_.size() as the number of outstanding
2612  // nonblocking message requests, so we resize to zero to maintain
2613  // this invariant.
2614  //
2615  // numReceives_ does _not_ include the self message, if there is
2616  // one. Here, we do actually send a message to ourselves, so we
2617  // include any self message in the "actual" number of receives to
2618  // post.
2619  //
2620  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2621  // doesn't (re)allocate its array of requests. That happens in
2622  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2623  // demand), or Resize_().
2624  const size_type actualNumReceives = as<size_type> (numReceives_) +
2625  as<size_type> (selfMessage_ ? 1 : 0);
2626  requests_.resize (0);
2627 
2628  // Post the nonblocking receives. It's common MPI wisdom to post
2629  // receives before sends. In MPI terms, this means favoring
2630  // adding to the "posted queue" (of receive requests) over adding
2631  // to the "unexpected queue" (of arrived messages not yet matched
2632  // with a receive).
2633  {
2634 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2635  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
2636 #endif // TPETRA_DISTRIBUTOR_TIMERS
2637 
2638  size_t curBufferOffset = 0;
2639  size_t curLIDoffset = 0;
2640  for (size_type i = 0; i < actualNumReceives; ++i) {
2641  size_t totalPacketsFrom_i = 0;
2642  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
2643  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
2644  }
2645  curLIDoffset += lengthsFrom_[i];
2646  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
2647  // If my process is receiving these packet(s) from another
2648  // process (not a self-receive), and if there is at least
2649  // one packet to receive:
2650  //
2651  // 1. Set up the persisting view (recvBuf) into the imports
2652  // array, given the offset and size (total number of
2653  // packets from process procsFrom_[i]).
2654  // 2. Start the Irecv and save the resulting request.
2655  imports_view_type recvBuf =
2656  subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
2657  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2658  tag, *comm_));
2659  }
2660  else { // Receiving these packet(s) from myself
2661  selfReceiveOffset = curBufferOffset; // Remember the offset
2662  }
2663  curBufferOffset += totalPacketsFrom_i;
2664  }
2665  }
2666 
2667  if (doBarrier) {
2668 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2669  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
2670 #endif // TPETRA_DISTRIBUTOR_TIMERS
2671  // If we are using ready sends (MPI_Rsend) below, we need to do
2672  // a barrier before we post the ready sends. This is because a
2673  // ready send requires that its matching receive has already
2674  // been posted before the send has been posted. The only way to
2675  // guarantee that in this case is to use a barrier.
2676  comm_->barrier ();
2677  }
2678 
2679 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2680  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
2681 #endif // TPETRA_DISTRIBUTOR_TIMERS
2682 
2683  // Set up arrays containing the starting offset into exports for each send,
2684  // and the number of packets to send for each send.
2685  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
2686  size_t maxNumPackets = 0;
2687  size_t curPKToffset = 0;
2688  for (size_t pp=0; pp<numSends_; ++pp) {
2689  sendPacketOffsets[pp] = curPKToffset;
2690  size_t numPackets = 0;
2691  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
2692  numPackets += numExportPacketsPerLID[j];
2693  }
2694  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
2695  packetsPerSend[pp] = numPackets;
2696  curPKToffset += numPackets;
2697  }
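  // Worked example of the bookkeeping above (hypothetical values): with
  // numSends_ = 2, startsTo_ = {0, 3}, lengthsTo_ = {3, 2}, and
  // numExportPacketsPerLID = {1, 2, 1, 4, 3}, we get
  // sendPacketOffsets = {0, 4}, packetsPerSend = {4, 7}, and
  // maxNumPackets = 7 (the size of the "slow" path's send buffer below).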
2698 
2699  // Set up the scan through the procsTo_ list, starting with higher-numbered
2700  // processes (this should help balance message traffic).
2701  size_t numBlocks = numSends_ + selfMessage_;
2702  size_t procIndex = 0;
2703  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
2704  ++procIndex;
2705  }
2706  if (procIndex == numBlocks) {
2707  procIndex = 0;
2708  }
2709 
2710  size_t selfNum = 0;
2711  size_t selfIndex = 0;
2712 
2713  if (indicesTo_.empty()) {
2714  if (debug_) {
2715  std::ostringstream os;
2716  os << myProcID << ": doPosts(4,fast): posting sends" << endl;
2717  *out_ << os.str ();
2718  }
2719 
2720  // Data are already blocked (laid out) by process, so we don't
2721  // need a separate send buffer (besides the exports array).
2722  for (size_t i = 0; i < numBlocks; ++i) {
2723  size_t p = i + procIndex;
2724  if (p > (numBlocks - 1)) {
2725  p -= numBlocks;
2726  }
2727 
2728  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
2729  exports_view_type tmpSend =
2730  subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
2731 
2732  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
2733  send<int> (tmpSend,
2734  as<int> (tmpSend.size ()),
2735  procsTo_[p], tag, *comm_);
2736  }
2737  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2738  readySend<int> (tmpSend,
2739  as<int> (tmpSend.size ()),
2740  procsTo_[p], tag, *comm_);
2741  }
2742  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2743  exports_view_type tmpSendBuf =
2744  subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
2745  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2746  tag, *comm_));
2747  }
2748  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2749  ssend<int> (tmpSend,
2750  as<int> (tmpSend.size ()),
2751  procsTo_[p], tag, *comm_);
2752  }
2753  else {
2754  TEUCHOS_TEST_FOR_EXCEPTION(
2755  true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): "
2756  "Invalid send type. We should never get here. "
2757  "Please report this bug to the Tpetra developers.");
2758  }
2759  }
2760  else { // "Sending" the message to myself
2761  selfNum = p;
2762  }
2763  }
2764 
2765  if (selfMessage_) {
2766  deep_copy_offset(imports, exports, selfReceiveOffset,
2767  sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
2768  }
2769  if (debug_) {
2770  std::ostringstream os;
2771  os << myProcID << ": doPosts(4,fast) done" << endl;
2772  *out_ << os.str ();
2773  }
2774  }
2775  else { // data are not blocked by proc, use send buffer
2776  if (debug_) {
2777  std::ostringstream os;
2778  os << myProcID << ": doPosts(4,slow): posting sends" << endl;
2779  *out_ << os.str ();
2780  }
2781 
2782  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
2783  typedef typename ExpView::non_const_value_type Packet;
2784  typedef typename ExpView::array_layout Layout;
2785  typedef typename ExpView::device_type Device;
2786  typedef typename ExpView::memory_traits Mem;
2787  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray", maxNumPackets); // send buffer
2788 
2789  TEUCHOS_TEST_FOR_EXCEPTION(
2790  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
2791  "Tpetra::Distributor::doPosts(4 args): The \"send buffer\" code path "
2792  "may not work with nonblocking sends.");
2793 
2794  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
2795  size_t ioffset = 0;
2796  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
2797  indicesOffsets[j] = ioffset;
2798  ioffset += numExportPacketsPerLID[j];
2799  }
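  // indicesOffsets is an exclusive prefix sum over numExportPacketsPerLID:
  // entry j is where local ID j's packets start in 'exports'.  For example
  // (hypothetical values), numExportPacketsPerLID = {2, 0, 3, 1} gives
  // indicesOffsets = {0, 2, 2, 5}.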
2800 
2801  for (size_t i = 0; i < numBlocks; ++i) {
2802  size_t p = i + procIndex;
2803  if (p > (numBlocks - 1)) {
2804  p -= numBlocks;
2805  }
2806 
2807  if (procsTo_[p] != myProcID) {
2808  size_t sendArrayOffset = 0;
2809  size_t j = startsTo_[p];
2810  size_t numPacketsTo_p = 0;
2811  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
      numPacketsTo_p += numExportPacketsPerLID[j]; // count the packets bound for procsTo_[p]
2812  deep_copy_offset(sendArray, exports, sendArrayOffset,
2813  indicesOffsets[j], numExportPacketsPerLID[j]);
2814  sendArrayOffset += numExportPacketsPerLID[j];
2815  }
2816  if (numPacketsTo_p > 0) {
2817  ImpView tmpSend =
2818  subview_offset(sendArray, size_t(0), numPacketsTo_p);
2819 
2820  if (sendType == Details::DISTRIBUTOR_RSEND) {
2821  readySend<int> (tmpSend,
2822  as<int> (tmpSend.size ()),
2823  procsTo_[p], tag, *comm_);
2824  }
2825  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2826  exports_view_type tmpSendBuf =
2827  subview_offset (sendArray, size_t(0), numPacketsTo_p);
2828  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2829  tag, *comm_));
2830  }
2831  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2832  ssend<int> (tmpSend,
2833  as<int> (tmpSend.size ()),
2834  procsTo_[p], tag, *comm_);
2835  }
2836  else { // sendType == Details::DISTRIBUTOR_SEND (the default)
2837  send<int> (tmpSend,
2838  as<int> (tmpSend.size ()),
2839  procsTo_[p], tag, *comm_);
2840  }
2841  }
2842  }
2843  else { // "Sending" the message to myself
2844  selfNum = p;
2845  selfIndex = startsTo_[p];
2846  }
2847  }
2848 
2849  if (selfMessage_) {
2850  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2851  deep_copy_offset(imports, exports, selfReceiveOffset,
2852  indicesOffsets[selfIndex],
2853  numExportPacketsPerLID[selfIndex]);
2854  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
2855  ++selfIndex;
2856  }
2857  }
2858  if (debug_) {
2859  std::ostringstream os;
2860  os << myProcID << ": doPosts(4,slow) done" << endl;
2861  *out_ << os.str ();
2862  }
2863  }
2864  }
2865 
2866  template <class ExpView, class ImpView>
2867  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2868  Distributor::
2869  doReversePostsAndWaits (const ExpView& exports,
2870  size_t numPackets,
2871  const ImpView& imports)
2872  {
2873  doReversePosts (exports, numPackets, imports);
2874  doReverseWaits ();
2875  }
2876 
2877  template <class ExpView, class ImpView>
2878  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2879  Distributor::
2880  doReversePostsAndWaits (const ExpView& exports,
2881  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2882  const ImpView& imports,
2883  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2884  {
2885  TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
2886  "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are "
2887  << requests_.size() << " outstanding nonblocking messages pending. It "
2888  "is incorrect to call this method with posts outstanding.");
2889 
2890  doReversePosts (exports, numExportPacketsPerLID, imports,
2891  numImportPacketsPerLID);
2892  doReverseWaits ();
2893  }
2894 
2895  template <class ExpView, class ImpView>
2896  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2897  Distributor::
2898  doReversePosts (const ExpView &exports,
2899  size_t numPackets,
2900  const ImpView &imports)
2901  {
2902  // FIXME (mfh 29 Mar 2012) WHY?
2903  TEUCHOS_TEST_FOR_EXCEPTION(
2904  ! indicesTo_.empty (), std::runtime_error,
2905  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2906  "reverse communication when original data are blocked by process.");
2907  if (reverseDistributor_.is_null ()) {
2908  createReverseDistributor ();
2909  }
2910  reverseDistributor_->doPosts (exports, numPackets, imports);
2911  }
2912 
2913  template <class ExpView, class ImpView>
2914  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2915  Distributor::
2916  doReversePosts (const ExpView &exports,
2917  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2918  const ImpView &imports,
2919  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2920  {
2921  // FIXME (mfh 29 Mar 2012) WHY?
2922  TEUCHOS_TEST_FOR_EXCEPTION(
2923  ! indicesTo_.empty (), std::runtime_error,
2924  "Tpetra::Distributor::doReversePosts(4 args): Can only do "
2925  "reverse communication when original data are blocked by process.");
2926  if (reverseDistributor_.is_null ()) {
2927  createReverseDistributor ();
2928  }
2929  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2930  imports, numImportPacketsPerLID);
2931  }
2932 
2933  template <class OrdinalType>
2934  void Distributor::
2935  computeSends (const Teuchos::ArrayView<const OrdinalType> & importGIDs,
2936  const Teuchos::ArrayView<const int> & importProcIDs,
2937  Teuchos::Array<OrdinalType> & exportGIDs,
2938  Teuchos::Array<int> & exportProcIDs)
2939  {
2940  // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
2941  // "assumes that size_t >= Ordinal". The code certainly does
2942  // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
2943  // sizeof(size_t) >= sizeof(int). This is because it casts the
2944  // OrdinalType elements of importGIDs (along with their
2945  // corresponding process IDs, as int) to size_t, and does a
2946  // doPostsAndWaits<size_t>() to send the packed data.
2947  using Teuchos::Array;
2948  using Teuchos::ArrayView;
2949  using std::endl;
2950  typedef typename ArrayView<const OrdinalType>::size_type size_type;
2951 
2952  Teuchos::OSTab tab (out_);
2953  const int myRank = comm_->getRank ();
2954  if (debug_) {
2955  std::ostringstream os;
2956  os << myRank << ": computeSends" << endl;
2957  *out_ << os.str ();
2958  }
2959 
2960  TEUCHOS_TEST_FOR_EXCEPTION(
2961  importGIDs.size () != importProcIDs.size (), std::invalid_argument,
2962  "Tpetra::Distributor::computeSends: On Process " << myRank << ": "
2963  "importProcIDs.size() = " << importProcIDs.size ()
2964  << " != importGIDs.size() = " << importGIDs.size () << ".");
2965 
2966  const size_type numImports = importProcIDs.size ();
2967  Array<size_t> importObjs (2*numImports);
2968  // Pack pairs (importGIDs[i], my process ID) to send into importObjs.
2969  for (size_type i = 0; i < numImports; ++i) {
2970  importObjs[2*i] = static_cast<size_t> (importGIDs[i]);
2971  importObjs[2*i+1] = static_cast<size_t> (myRank);
2972  }
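  // For example (hypothetical values), on Process 3 with importGIDs = {42, 7},
  // importObjs becomes {42, 3, 7, 3}: each GID is followed by the rank that
  // wants it, so the owning process can recover both pieces after the
  // exchange below.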
2973  //
2974  // Use a temporary Distributor to send the (importGIDs[i], myRank)
2975  // pairs to importProcIDs[i].
2976  //
2977  Distributor tempPlan (comm_, out_);
2978  if (debug_) {
2979  std::ostringstream os;
2980  os << myRank << ": computeSends: tempPlan.createFromSends" << endl;
2981  *out_ << os.str ();
2982  }
2983 
2984  // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
2985  // signed, in order to forestall any possible causes for Bug 6069.
2986  const size_t numExportsAsSizeT = tempPlan.createFromSends (importProcIDs);
2987  const size_type numExports = static_cast<size_type> (numExportsAsSizeT);
2988  TEUCHOS_TEST_FOR_EXCEPTION(
2989  numExports < 0, std::logic_error, "Tpetra::Distributor::computeSends: "
2990  "tempPlan.createFromSends() returned numExports = " << numExportsAsSizeT
2991  << " as a size_t, which overflows to " << numExports << " when cast to "
2992  << Teuchos::TypeNameTraits<size_type>::name () << ". "
2993  "Please report this bug to the Tpetra developers.");
2994  TEUCHOS_TEST_FOR_EXCEPTION(
2995  static_cast<size_type> (tempPlan.getTotalReceiveLength ()) != numExports,
2996  std::logic_error, "Tpetra::Distributor::computeSends: tempPlan.getTotal"
2997  "ReceiveLength() = " << tempPlan.getTotalReceiveLength () << " != num"
2998  "Exports = " << numExports << ". Please report this bug to the "
2999  "Tpetra developers.");
3000 
3001  if (numExports > 0) {
3002  exportGIDs.resize (numExports);
3003  exportProcIDs.resize (numExports);
3004  }
3005 
3006  // exportObjs: Packed receive buffer. (exportObjs[2*i],
3007  // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
3008  // after tempPlan.doPostsAndWaits(...) finishes below.
3009  //
3010  // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
3011  // size_t. This issue might come up, for example, on a 32-bit
3012  // machine using 64-bit global indices. I will add a check here
3013  // for that case.
3014  TEUCHOS_TEST_FOR_EXCEPTION(
3015  sizeof (size_t) < sizeof (OrdinalType), std::logic_error,
3016  "Tpetra::Distributor::computeSends: sizeof(size_t) = " << sizeof(size_t)
3017  << " < sizeof(" << Teuchos::TypeNameTraits<OrdinalType>::name () << ") = "
3018  << sizeof (OrdinalType) << ". This violates an assumption of the "
3019  "method. It's not hard to work around (just use Array<OrdinalType> as "
3020  "the export buffer, not Array<size_t>), but we haven't done that yet. "
3021  "Please report this bug to the Tpetra developers.");
3022 
3023  TEUCHOS_TEST_FOR_EXCEPTION(
3024  tempPlan.getTotalReceiveLength () < static_cast<size_t> (numExports),
3025  std::logic_error,
3026  "Tpetra::Distributor::computeSends: tempPlan.getTotalReceiveLength() = "
3027  << tempPlan.getTotalReceiveLength() << " < numExports = " << numExports
3028  << ". Please report this bug to the Tpetra developers.");
3029 
3030  Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
3031  if (debug_) {
3032  std::ostringstream os;
3033  os << myRank << ": computeSends: tempPlan.doPostsAndWaits" << endl;
3034  *out_ << os.str ();
3035  }
3036  tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());
3037 
3038  // Unpack the received (GID, PID) pairs into exportGIDs and exportProcIDs, respectively.
3039  for (size_type i = 0; i < numExports; ++i) {
3040  exportGIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
3041  exportProcIDs[i] = static_cast<int> (exportObjs[2*i+1]);
3042  }
3043 
3044  if (debug_) {
3045  std::ostringstream os;
3046  os << myRank << ": computeSends done" << endl;
3047  *out_ << os.str ();
3048  }
3049  }
3050 
3051  template <class OrdinalType>
3052  void Distributor::
3053  createFromRecvs (const Teuchos::ArrayView<const OrdinalType> &remoteGIDs,
3054  const Teuchos::ArrayView<const int> &remoteProcIDs,
3055  Teuchos::Array<OrdinalType> &exportGIDs,
3056  Teuchos::Array<int> &exportProcIDs)
3057  {
3058  using std::endl;
3059 
3060  Teuchos::OSTab tab (out_);
3061  const int myRank = comm_->getRank();
3062 
3063  if (debug_) {
3064  *out_ << myRank << ": createFromRecvs" << endl;
3065  }
3066 
3067 #ifdef HAVE_TPETRA_DEBUG
3068  using Teuchos::outArg;
3069  using Teuchos::reduceAll;
3070 
3071  // In debug mode, first test locally, then do an all-reduce to
3072  // make sure that all processes passed.
3073  const int errProc =
3074  (remoteGIDs.size () != remoteProcIDs.size ()) ? myRank : -1;
3075  int maxErrProc = -1;
3076  reduceAll<int, int> (*comm_, Teuchos::REDUCE_MAX, errProc, outArg (maxErrProc));
3077  TEUCHOS_TEST_FOR_EXCEPTION(maxErrProc != -1, std::runtime_error,
3078  Teuchos::typeName (*this) << "::createFromRecvs(): lists of remote IDs "
3079  "and remote process IDs must have the same size on all participating "
3080  "processes. Maximum process ID with error: " << maxErrProc << ".");
3081 #else // NOT HAVE_TPETRA_DEBUG
3082 
3083  // In non-debug mode, just test locally.
3084  TEUCHOS_TEST_FOR_EXCEPTION(
3085  remoteGIDs.size () != remoteProcIDs.size (), std::invalid_argument,
3086  Teuchos::typeName (*this) << "::createFromRecvs<" <<
3087  Teuchos::TypeNameTraits<OrdinalType>::name () << ">(): On Process " <<
3088  myRank << ": remoteGIDs.size() = " << remoteGIDs.size () << " != "
3089  "remoteProcIDs.size() = " << remoteProcIDs.size () << ".");
3090 #endif // HAVE_TPETRA_DEBUG
3091 
3092  computeSends (remoteGIDs, remoteProcIDs, exportGIDs, exportProcIDs);
3093 
3094  const size_t numProcsSendingToMe = createFromSends (exportProcIDs ());
3095 
3096  if (debug_) {
3097  // NOTE (mfh 20 Mar 2014) If remoteProcIDs could contain
3098  // duplicates, then its length might not be the right check here,
3099  // even if we account for selfMessage_. selfMessage_ is set in
3100  // createFromSends.
3101  std::ostringstream os;
3102  os << "Proc " << myRank << ": {numProcsSendingToMe: "
3103  << numProcsSendingToMe << ", remoteProcIDs.size(): "
3104  << remoteProcIDs.size () << ", selfMessage_: "
3105  << (selfMessage_ ? "true" : "false") << "}" << std::endl;
3106  std::cerr << os.str ();
3107  }
3108 
3109  if (debug_) {
3110  *out_ << myRank << ": createFromRecvs done" << endl;
3111  }
3112 
3113  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
3114  }
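  // Illustrative usage sketch (hypothetical names): createFromRecvs is the
  // "I know what I need to receive" entry point.  Given the GIDs this process
  // must receive and the ranks that own them, it computes (via computeSends)
  // what this process must send in return and sets up the plan:
  //
  //   Teuchos::Array<GO> exportGIDs;      // GO: hypothetical global ordinal type
  //   Teuchos::Array<int> exportProcIDs;
  //   distor.createFromRecvs (remoteGIDs (), remoteProcIDs (),
  //                           exportGIDs, exportProcIDs);
  //   // distor is now ready for doPostsAndWaits / doReversePostsAndWaits.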
3115 
3116 
3117 } // namespace Tpetra
3118 
3119 #endif // TPETRA_DISTRIBUTOR_HPP