Tpetra parallel linear algebra  Version of the Day
Tpetra_Import_Util2.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_IMPORT_UTIL2_HPP
43 #define TPETRA_IMPORT_UTIL2_HPP
44 
49 
50 #include "Tpetra_ConfigDefs.hpp"
52 #include "Tpetra_Import.hpp"
53 #include "Tpetra_HashTable.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Util.hpp"
56 #include "Tpetra_Distributor.hpp"
57 #include "Kokkos_DualView.hpp"
58 #include <Teuchos_Array.hpp>
59 #include <utility>
60 
61 // Tpetra::CrsMatrix uses the functions below in its implementation.
62 // To avoid a circular include issue, only include the declarations
63 // for CrsMatrix. We will include the definition after the functions
64 // here have been defined.
66 
67 
68 namespace { // (anonymous)
69 
70  template<class T, class D>
71  Kokkos::View<T*, D, Kokkos::MemoryUnmanaged>
72  getNonconstView (const Teuchos::ArrayView<T>& x)
73  {
74  typedef Kokkos::View<T*, D, Kokkos::MemoryUnmanaged> view_type;
75  typedef typename view_type::size_type size_type;
76  const size_type numEnt = static_cast<size_type> (x.size ());
77  return view_type (x.getRawPtr (), numEnt);
78  }
79 
80  template<class T, class D>
81  Kokkos::View<const T*, D, Kokkos::MemoryUnmanaged>
82  getConstView (const Teuchos::ArrayView<const T>& x)
83  {
84  typedef Kokkos::View<const T*, D, Kokkos::MemoryUnmanaged> view_type;
85  typedef typename view_type::size_type size_type;
86  const size_type numEnt = static_cast<size_type> (x.size ());
87  return view_type (x.getRawPtr (), numEnt);
88  }
89 
90  // For a given Kokkos (execution or memory) space, return both its
91  // execution space, and the corresponding host execution space.
92  template<class Space>
93  struct GetHostExecSpace {
94  typedef typename Space::execution_space execution_space;
95  typedef typename Kokkos::View<int*, execution_space>::HostMirror::execution_space host_execution_space;
96  };
97 
98 } // namespace (anonymous)
99 
100 namespace Tpetra {
101 namespace Import_Util {
102 
117 template<typename Scalar,
118  typename LocalOrdinal,
119  typename GlobalOrdinal,
120  typename Node>
121 void
122 packAndPrepareWithOwningPIDs (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
123  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
124  Kokkos::DualView<char*, typename Node::device_type>& exports,
125  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
126  size_t& constantNumPackets,
127  Distributor &distor,
128  const Teuchos::ArrayView<const int>& SourcePids);
129 
145 template<typename Scalar,
146  typename LocalOrdinal,
147  typename GlobalOrdinal,
148  typename Node>
149 size_t
150 unpackAndCombineWithOwningPIDsCount (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
151  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
152  const Teuchos::ArrayView<const char> &imports,
153  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
154  size_t constantNumPackets,
155  Distributor &distor,
156  CombineMode combineMode,
157  size_t numSameIDs,
158  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
159  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs);
160 
175 template<typename Scalar,
176  typename LocalOrdinal,
177  typename GlobalOrdinal,
178  typename Node>
179 void
180 unpackAndCombineIntoCrsArrays (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
181  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
182  const Teuchos::ArrayView<const char>& imports,
183  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
184  size_t constantNumPackets,
185  Distributor &distor,
186  CombineMode combineMode,
187  size_t numSameIDs,
188  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
189  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
190  size_t TargetNumRows,
191  size_t TargetNumNonzeros,
192  int MyTargetPID,
193  const Teuchos::ArrayView<size_t>& rowPointers,
194  const Teuchos::ArrayView<GlobalOrdinal>& columnIndices,
195  const Teuchos::ArrayView<Scalar>& values,
196  const Teuchos::ArrayView<const int>& SourcePids,
197  Teuchos::Array<int>& TargetPids);
198 
201 template<typename Scalar, typename Ordinal>
202 void
203 sortCrsEntries (const Teuchos::ArrayView<size_t>& CRS_rowptr,
204  const Teuchos::ArrayView<Ordinal>& CRS_colind,
205  const Teuchos::ArrayView<Scalar>&CRS_vals);
206 
211 template<typename Scalar, typename Ordinal>
212 void
213 sortAndMergeCrsEntries (const Teuchos::ArrayView<size_t>& CRS_rowptr,
214  const Teuchos::ArrayView<Ordinal>& CRS_colind,
215  const Teuchos::ArrayView<Scalar>& CRS_vals);
216 
232 template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
233 void
234 lowCommunicationMakeColMapAndReindex (const Teuchos::ArrayView<const size_t> &rowPointers,
235  const Teuchos::ArrayView<LocalOrdinal> &columnIndices_LID,
236  const Teuchos::ArrayView<GlobalOrdinal> &columnIndices_GID,
237  const Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & domainMap,
238  const Teuchos::ArrayView<const int> &owningPids,
239  Teuchos::Array<int> &remotePids,
240  Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & colMap);
241 
242 
243 } // namespace Import_Util
244 } // namespace Tpetra
245 
246 
247 //
248 // Implementations
249 //
250 
251 namespace { // (anonymous)
252 
269  template<class LO, class GO, class D>
270  size_t
271  packRowCount (const size_t numEnt,
272  const size_t numBytesPerValue)
273  {
275 
276  if (numEnt == 0) {
277  // Empty rows always take zero bytes, to ensure sparsity.
278  return 0;
279  }
280  else {
281  // We store the number of entries as a local index (LO).
282  LO numEntLO = 0; // packValueCount wants this.
283  GO gid;
284  int lid;
285  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
286  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
287  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (lid);
288  const size_t valsLen = numEnt * numBytesPerValue;
289  return numEntLen + gidsLen + pidsLen + valsLen;
290  }
291  }
292 
293  template<class LO, class D>
294  size_t
295  unpackRowCount (const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
296  const size_t offset,
297  const size_t numBytes,
298  const size_t numBytesPerValue)
299  {
300  using Kokkos::subview;
302  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
303  typedef typename input_buffer_type::size_type size_type;
304 
305  if (numBytes == 0) {
306  // Empty rows always take zero bytes, to ensure sparsity.
307  return static_cast<size_t> (0);
308  }
309  else {
310  LO numEntLO = 0;
311  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
312 #ifdef HAVE_TPETRA_DEBUG
313  TEUCHOS_TEST_FOR_EXCEPTION(
314  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
315  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
316  << ".");
317 #endif // HAVE_TPETRA_DEBUG
318  const std::pair<size_type, size_type> rng (offset, offset + theNumBytes);
319  input_buffer_type inBuf = subview (imports, rng); // imports (offset, theNumBytes);
320  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
321  (void)actualNumBytes;
322 #ifdef HAVE_TPETRA_DEBUG
323  TEUCHOS_TEST_FOR_EXCEPTION(
324  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
325  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
326  << ".");
327 #endif // HAVE_TPETRA_DEBUG
328  return static_cast<size_t> (numEntLO);
329  }
330  }
331 
332  // Return the number of bytes packed.
333  template<class ST, class LO, class GO, class D>
334  size_t
335  packRow (const typename Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
336  const size_t offset,
337  const size_t numEnt,
341  const size_t numBytesPerValue)
342  {
343  using Kokkos::subview;
345  // NOTE (mfh 02 Feb 2015) This assumes that output_buffer_type is
346  // the same, no matter what type we're packing. It's a reasonable
347  // assumption, given that we go through the trouble of PackTraits
348  // just so that we can pack data of different types in the same
349  // buffer.
350  typedef typename PackTraits<LO, D>::output_buffer_type output_buffer_type;
351  typedef typename output_buffer_type::size_type size_type;
352  typedef typename std::pair<size_type, size_type> pair_type;
353 
354  if (numEnt == 0) {
355  // Empty rows always take zero bytes, to ensure sparsity.
356  return 0;
357  }
358 
359  const GO gid = 0; // packValueCount wants this
360  const LO numEntLO = static_cast<size_t> (numEnt);
361  const int pid = 0; // packValueCount wants this
362 
363  const size_t numEntBeg = offset;
364  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
365  const size_t gidsBeg = numEntBeg + numEntLen;
366  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
367  const size_t pidsBeg = gidsBeg + gidsLen;
368  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (pid);
369  const size_t valsBeg = pidsBeg + pidsLen;
370  const size_t valsLen = numEnt * numBytesPerValue;
371 
372  output_buffer_type numEntOut =
373  subview (exports, pair_type (numEntBeg, numEntBeg + numEntLen));
374  output_buffer_type gidsOut =
375  subview (exports, pair_type (gidsBeg, gidsBeg + gidsLen));
376  output_buffer_type pidsOut =
377  subview (exports, pair_type (pidsBeg, pidsBeg + pidsLen));
378  output_buffer_type valsOut =
379  subview (exports, pair_type (valsBeg, valsBeg + valsLen));
380 
381  size_t numBytesOut = 0;
382  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
383  numBytesOut += PackTraits<GO, D>::packArray (gidsOut, gidsIn, numEnt);
384  numBytesOut += PackTraits<int, D>::packArray (pidsOut, pidsIn, numEnt);
385  numBytesOut += PackTraits<ST, D>::packArray (valsOut, valsIn, numEnt);
386 
387  const size_t expectedNumBytes = numEntLen + gidsLen + pidsLen + valsLen;
388  TEUCHOS_TEST_FOR_EXCEPTION(
389  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
390  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
391  << expectedNumBytes << ".");
392 
393  return numBytesOut;
394  }
395 
396  // Return the number of bytes actually read / used.
397  template<class ST, class LO, class GO, class D>
398  size_t
399  unpackRow (const typename Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
403  const size_t offset,
404  const size_t numBytes,
405  const size_t numEnt,
406  const size_t numBytesPerValue)
407  {
408  using Kokkos::subview;
410  // NOTE (mfh 02 Feb 2015) This assumes that input_buffer_type is
411  // the same, no matter what type we're unpacking. It's a
412  // reasonable assumption, given that we go through the trouble of
413  // PackTraits just so that we can pack data of different types in
414  // the same buffer.
415  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
416  typedef typename input_buffer_type::size_type size_type;
417  typedef typename std::pair<size_type, size_type> pair_type;
418 
419  if (numBytes == 0) {
420  // Rows with zero bytes always have zero entries.
421  return 0;
422  }
423  TEUCHOS_TEST_FOR_EXCEPTION(
424  static_cast<size_t> (imports.dimension_0 ()) <= offset, std::logic_error,
425  "unpackRow: imports.dimension_0() = " << imports.dimension_0 () <<
426  " <= offset = " << offset << ".");
427  TEUCHOS_TEST_FOR_EXCEPTION(
428  static_cast<size_t> (imports.dimension_0 ()) < offset + numBytes,
429  std::logic_error, "unpackRow: imports.dimension_0() = "
430  << imports.dimension_0 () << " < offset + numBytes = "
431  << (offset + numBytes) << ".");
432 
433  const GO gid = 0; // packValueCount wants this
434  const LO lid = 0; // packValueCount wants this
435  const int pid = 0; // packValueCount wants this
436 
437  const size_t numEntBeg = offset;
438  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
439  const size_t gidsBeg = numEntBeg + numEntLen;
440  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
441  const size_t pidsBeg = gidsBeg + gidsLen;
442  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (pid);
443  const size_t valsBeg = pidsBeg + pidsLen;
444  const size_t valsLen = numEnt * numBytesPerValue;
445 
446  input_buffer_type numEntIn = subview (imports, pair_type (numEntBeg, numEntBeg + numEntLen));
447  input_buffer_type gidsIn = subview (imports, pair_type (gidsBeg, gidsBeg + gidsLen));
448  input_buffer_type pidsIn = subview (imports, pair_type (pidsBeg, pidsBeg + pidsLen));
449  input_buffer_type valsIn = subview (imports, pair_type (valsBeg, valsBeg + valsLen));
450 
451  size_t numBytesOut = 0;
452  LO numEntOut;
453  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
454  TEUCHOS_TEST_FOR_EXCEPTION(
455  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
456  "unpackRow: Expected number of entries " << numEnt
457  << " != actual number of entries " << numEntOut << ".");
458 
459  numBytesOut += PackTraits<GO, D>::unpackArray (gidsOut, gidsIn, numEnt);
460  numBytesOut += PackTraits<int, D>::unpackArray (pidsOut, pidsIn, numEnt);
461  numBytesOut += PackTraits<ST, D>::unpackArray (valsOut, valsIn, numEnt);
462 
463  TEUCHOS_TEST_FOR_EXCEPTION(
464  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
465  << numBytesOut << " != numBytes = " << numBytes << ".");
466  const size_t expectedNumBytes = numEntLen + gidsLen + pidsLen + valsLen;
467  TEUCHOS_TEST_FOR_EXCEPTION(
468  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
469  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
470  << expectedNumBytes << ".");
471  return numBytesOut;
472  }
473 
474  // mfh 28 Apr 2016: Sometimes we have a raw host array, and we need
475  // to make a Kokkos::View out of it that lives in a certain memory
476  // space. We don't want to make a deep copy of the input array if
477  // we don't need to, but if the memory spaces are different, we need
478  // to. The following code does that. The struct is an
479  // implementation detail, and the "free" function
480  // get1DConstViewOfUnmanagedArray is the interface to call.
481 
482  template<class ST, class DT,
483  const bool outputIsHostMemory =
484  std::is_same<typename DT::memory_space, Kokkos::HostSpace>::value>
485  struct Get1DConstViewOfUnmanagedHostArray {};
486 
487  template<class ST, class DT>
488  struct Get1DConstViewOfUnmanagedHostArray<ST, DT, true> {
489  typedef Kokkos::View<const ST*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> output_view_type;
490 
491  static output_view_type
492  getView (const char /* label */ [], const ST* x_raw, const size_t x_len)
493  {
494  // We can return the input array, wrapped as an unmanaged View.
495  // Ignore the label, since unmanaged Views don't have labels.
496  return output_view_type (x_raw, x_len);
497  }
498  };
499 
500  template<class ST, class DT>
501  struct Get1DConstViewOfUnmanagedHostArray<ST, DT, false> {
502  typedef Kokkos::View<const ST*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> input_view_type;
503  typedef Kokkos::View<const ST*, DT> output_view_type;
504 
505  static output_view_type
506  getView (const char label[], const ST* x_raw, const size_t x_len)
507  {
508  input_view_type x_in (x_raw, x_len);
509  // The memory spaces are different, so we have to create a new
510  // View which is a deep copy of the input array.
511  //
512  // FIXME (mfh 28 Apr 2016) This needs to be converted to
513  // std::string, else the compiler can't figure out what
514  // constructor we're calling.
515  Kokkos::View<ST*, DT> x_out (std::string (label), x_len);
516  Kokkos::deep_copy (x_out, x_in);
517  return x_out;
518  }
519  };
520 
521  template<class ST, class DT>
522  typename Get1DConstViewOfUnmanagedHostArray<ST, DT>::output_view_type
523  get1DConstViewOfUnmanagedHostArray (const char label[], const ST* x_raw, const size_t x_len)
524  {
525  return Get1DConstViewOfUnmanagedHostArray<ST, DT>::getView (label, x_raw, x_len);
526  }
527 
528 } // namespace (anonymous)
529 
530 
531 namespace Tpetra {
532 namespace Import_Util {
533 
534 template<typename Scalar,
535  typename LocalOrdinal,
536  typename GlobalOrdinal,
537  typename Node>
538 void
540  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
541  Kokkos::DualView<char*, typename Node::device_type>& exports,
542  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
543  size_t& constantNumPackets,
544  Distributor &distor,
545  const Teuchos::ArrayView<const int>& SourcePids)
546 {
548  using Kokkos::MemoryUnmanaged;
549  using Kokkos::subview;
550  using Kokkos::View;
551  using Teuchos::Array;
552  using Teuchos::ArrayView;
553  using Teuchos::as;
554  using Teuchos::RCP;
555  typedef LocalOrdinal LO;
556  typedef GlobalOrdinal GO;
558  typedef typename matrix_type::impl_scalar_type ST;
559  typedef typename Node::device_type device_type;
560  typedef typename device_type::execution_space execution_space;
561  // The device type of the HostMirror of a device_type View. This
562  // might not necessarily be Kokkos::HostSpace! In particular, if
563  // device_type::memory_space is CudaUVMSpace and the CMake option
564  // Kokkos_ENABLE_Cuda_UVM is ON, the corresponding HostMirror's
565  // memory space is also CudaUVMSpace.
566  typedef typename Kokkos::DualView<char*, device_type>::t_host::device_type HDS;
567  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
568  typedef typename ArrayView<const LO>::size_type size_type;
569  typedef std::pair<typename View<int*, HDS>::size_type,
570  typename View<int*, HDS>::size_type> pair_type;
571  const char prefix[] = "Tpetra::Import_Util::packAndPrepareWithOwningPIDs: ";
572 
573  // FIXME (mfh 03 Jan 2015) Currently, it might be the case that if a
574  // graph or matrix owns no entries on a process, then it reports
575  // that it is neither locally nor globally indexed, even if the
576  // graph or matrix has a column Map. This should change once we get
577  // rid of lazy initialization in CrsGraph and CrsMatrix.
578  TEUCHOS_TEST_FOR_EXCEPTION(
579  ! SourceMatrix.isLocallyIndexed (), std::invalid_argument,
580  prefix << "SourceMatrix must be locally indexed.");
581  TEUCHOS_TEST_FOR_EXCEPTION(
582  SourceMatrix.getColMap ().is_null (), std::logic_error,
583  prefix << "The source matrix claims to be locally indexed, but its column "
584  "Map is null. This should never happen. Please report this bug to the "
585  "Tpetra developers.");
586  const size_type numExportLIDs = exportLIDs.size ();
587  TEUCHOS_TEST_FOR_EXCEPTION(
588  numExportLIDs != numPacketsPerLID.size (), std::invalid_argument, prefix
589  << "exportLIDs.size() = " << numExportLIDs << "!= numPacketsPerLID.size() "
590  << " = " << numPacketsPerLID.size () << ".");
591  TEUCHOS_TEST_FOR_EXCEPTION(
592  static_cast<size_t> (SourcePids.size ()) != SourceMatrix.getColMap ()->getNodeNumElements (),
593  std::invalid_argument, prefix << "SourcePids.size() = "
594  << SourcePids.size ()
595  << "!= SourceMatrix.getColMap()->getNodeNumElements() = "
596  << SourceMatrix.getColMap ()->getNodeNumElements () << ".");
597 
598  // This tells the caller that different rows may have different
599  // numbers of entries. That is, the number of packets per LID might
600  // not be a constant.
601  constantNumPackets = 0;
602 
603  // Compute the number of bytes ("packets") per row to pack. While
604  // we're at it, compute the total number of matrix entries to send,
605  // and the max number of entries in any of the rows we're sending.
606  size_t totalNumBytes = 0;
607  size_t totalNumEntries = 0;
608  size_t maxRowLength = 0;
609  for (size_type i = 0; i < numExportLIDs; ++i) {
610  const LO lclRow = exportLIDs[i];
611  const size_t numEnt = SourceMatrix.getNumEntriesInLocalRow (lclRow);
612 
613  // The 'if' branch implicitly assumes that packRowCount() returns
614  // zero if numEnt == 0.
615  size_t numBytesPerValue = 0;
616  if (numEnt > 0) {
617  // Get a locally indexed view of the current row's data. If the
618  // current row has > 0 entries, we need an entry in order to
619  // figure out the byte count of the packed row. (We really only
620  // need it if ST's size is determined at run time.)
621  ArrayView<const Scalar> valsView;
622  ArrayView<const LO> lidsView;
623  SourceMatrix.getLocalRowView (lclRow, lidsView, valsView);
624  const ST* valsViewRaw = reinterpret_cast<const ST*> (valsView.getRawPtr ());
625  View<const ST*, Kokkos::HostSpace, MemoryUnmanaged> valsViewK (valsViewRaw, valsView.size ());
626  TEUCHOS_TEST_FOR_EXCEPTION(
627  static_cast<size_t> (valsViewK.dimension_0 ()) != numEnt,
628  std::logic_error, prefix << "Local row " << i << " claims to have "
629  << numEnt << "entry/ies, but the View returned by getLocalRowView() "
630  "has " << valsViewK.dimension_0 () << " entry/ies. This should never "
631  "happen. Please report this bug to the Tpetra developers.");
632 
633  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
634  // space here for now, this doesn't assume UVM. That may change
635  // in the future, if we ever start packing on the device.
636  //
637  // FIXME (mfh 28 Apr 2016) For now, we assume that the value
638  // returned by packValueCount is independent of the memory
639  // space. This assumption helps with #227.
640  numBytesPerValue = PackTraits<ST, Kokkos::HostSpace>::packValueCount (valsViewK(0));
641  }
642 
643  // FIXME (mfh 28 Apr 2016) For now, we assume that the value
644  // returned by packRowCount is independent of the memory space.
645  // This assumption helps with #227.
646  const size_t numBytes =
647  packRowCount<LO, GO, Kokkos::HostSpace> (numEnt, numBytesPerValue);
648  numPacketsPerLID[i] = numBytes;
649  totalNumBytes += numBytes;
650  totalNumEntries += numEnt;
651  maxRowLength = std::max (maxRowLength, numEnt);
652  }
653 
654  // We use a "struct of arrays" approach to packing each row's
655  // entries. All the column indices (as global indices) go first,
656  // then all their owning process ranks, and then the values.
657  if (totalNumEntries > 0) {
658  if (static_cast<size_t> (exports.dimension_0 ()) !=
659  static_cast<size_t> (totalNumBytes)) {
660  // FIXME (26 Apr 2016) Fences around (UVM) allocations only
661  // temporarily needed for #227 debugging. Should be able to
662  // remove them after that's fixed.
663  execution_space::fence ();
664  exports = Kokkos::DualView<char*, device_type> ("exports", totalNumBytes);
665  execution_space::fence ();
666  }
667 
668  // mfh 26 Apr 2016: The code below currently fills on host. We
669  // may change this in the future.
670  exports.template modify<Kokkos::HostSpace> ();
671 
672  // FIXME (mfh 28 Apr 2016) We take subviews in a loop, so it might
673  // pay to use an unmanaged View to reduce reference count update
674  // overhead. On the other hand, with CudaUVMSpace, it's not
675  // obvious what the type of "the unmanaged version of exportsK"
676  // should be. It's memory space is not Kokkos::HostSpace, for
677  // example! What we really need is a "create_unmanaged_view"
678  // function that works like Kokkos::Compat::create_const_view.
679  // For now, I'll just use the managed View.
680 
681  //auto exportsK_managed = exports.template view<Kokkos::HostSpace> ();
682  //View<char*, Kokkos::HostSpace, MemoryUnmanaged> exportsK = exportsK_managed;
683 
684  auto exports_h = exports.template view<Kokkos::HostSpace> ();
685 
686  // Current position (in bytes) in the 'exports' output array.
687  size_t offset = 0;
688 
689  // For each row of the matrix owned by the calling process, pack
690  // that row's column indices and values into the exports array.
691 
692  // Locally indexed matrices always have a column Map.
693  const map_type& colMap = * (SourceMatrix.getColMap ());
694 
695  // Temporary buffers for a copy of the column gids/pids
696  typename View<GO*, device_type>::HostMirror gids;
697  typename View<int*, device_type>::HostMirror pids;
698  {
699  GO gid;
700  int pid;
701  gids = PackTraits<GO, HDS>::allocateArray (gid, maxRowLength, "gids");
702  pids = PackTraits<int, HDS>::allocateArray (pid, maxRowLength, "pids");
703  }
704 
705  for (size_type i = 0; i < numExportLIDs; i++) {
706  const LO lclRow = exportLIDs[i];
707 
708  // Get a locally indexed view of the current row's data.
709  ArrayView<const Scalar> valsView;
710  ArrayView<const LO> lidsView;
711  SourceMatrix.getLocalRowView (lclRow, lidsView, valsView);
712  const ST* valsViewRaw = reinterpret_cast<const ST*> (valsView.getRawPtr ());
713 
714  // This is just a shallow copy if not CUDA, else a deep copy
715  // (into HDS, namely Device<Cuda, CudaUVMSpace>) if CUDA.
716  //
717  // FIXME (mfh 28 Apr 2016) This is slow for the CUDA case, but
718  // it should be correct.
719  auto valsViewK = get1DConstViewOfUnmanagedHostArray<ST, HDS> ("valsViewK", valsViewRaw, static_cast<size_t> (valsView.size ()));
720  const size_t numEnt = static_cast<size_t> (valsViewK.dimension_0 ());
721 
722  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
723  // space here for now, this doesn't assume UVM. That may change
724  // in the future, if we ever start packing on the device.
725  //
726  // FIXME (mfh 28 Apr 2016) For now, we assume that the value
727  // returned by packValueCount is independent of the memory
728  // space. This assumption helps with #227.
729  const size_t numBytesPerValue = numEnt == 0 ?
730  static_cast<size_t> (0) :
731  PackTraits<ST, Kokkos::HostSpace>::packValueCount (valsViewK(0));
732 
733  // Convert column indices as LIDs to column indices as GIDs.
734  auto gidsView = subview (gids, pair_type (0, numEnt));
735  auto pidsView = subview (pids, pair_type (0, numEnt));
736  for (size_t k = 0; k < numEnt; ++k) {
737  gidsView(k) = colMap.getGlobalElement (lidsView[k]);
738  pidsView(k) = SourcePids[lidsView[k]];
739  }
740 
741  // Copy the row's data into the current spot in the exports array.
742  const size_t numBytes =
743  packRow<ST, LO, GO, HDS> (exports_h, offset, numEnt,
744  gidsView, pidsView, valsViewK,
745  numBytesPerValue);
746  // Keep track of how many bytes we packed.
747  offset += numBytes;
748  }
749 
750 #ifdef HAVE_TPETRA_DEBUG
751  TEUCHOS_TEST_FOR_EXCEPTION(
752  offset != totalNumBytes, std::logic_error, prefix << "At end of method, "
753  "the final offset (in bytes) " << offset << " does not equal the total "
754  "number of bytes packed " << totalNumBytes << ". Please report this bug "
755  "to the Tpetra developers.");
756 #endif // HAVE_TPETRA_DEBUG
757  }
758 }
759 
760 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
761 size_t
763  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
764  const Teuchos::ArrayView<const char> &imports,
765  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
766  size_t constantNumPackets,
767  Distributor &distor,
768  CombineMode combineMode,
769  size_t numSameIDs,
770  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
771  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
772 {
773  using Kokkos::MemoryUnmanaged;
774  using Kokkos::View;
775  typedef LocalOrdinal LO;
776  typedef GlobalOrdinal GO;
777  typedef CrsMatrix<Scalar, LO, GO, Node> matrix_type;
778  typedef typename matrix_type::impl_scalar_type ST;
779  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
780  typedef typename Node::execution_space execution_space;
781  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
782  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
783 
784  TEUCHOS_TEST_FOR_EXCEPTION(
785  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
786  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
787  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
788  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
789  // process, then the matrix is neither locally nor globally indexed.
790  const bool locallyIndexed = SourceMatrix.isLocallyIndexed ();
791  TEUCHOS_TEST_FOR_EXCEPTION(
792  ! locallyIndexed, std::invalid_argument, prefix << "The input CrsMatrix "
793  "'SourceMatrix' must be locally indexed.");
794  TEUCHOS_TEST_FOR_EXCEPTION(
795  importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
796  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
797  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
798 
799  View<const char*, HES, MemoryUnmanaged> importsK (imports.getRawPtr (),
800  imports.size ());
801 
802  // Number of matrix entries to unpack (returned by this function).
803  size_t nnz = 0;
804 
805  // Count entries copied directly from the source matrix without permuting.
806  for (size_t sourceLID = 0; sourceLID < numSameIDs; ++sourceLID) {
807  const LO srcLID = static_cast<LO> (sourceLID);
808  nnz += SourceMatrix.getNumEntriesInLocalRow (srcLID);
809  }
810 
811  // Count entries copied directly from the source matrix with permuting.
812  const size_type numPermuteToLIDs = permuteToLIDs.size ();
813  for (size_type p = 0; p < numPermuteToLIDs; ++p) {
814  nnz += SourceMatrix.getNumEntriesInLocalRow (permuteFromLIDs[p]);
815  }
816 
817  // Count entries received from other MPI processes.
818  size_t offset = 0;
819  const size_type numImportLIDs = importLIDs.size ();
820  for (size_type i = 0; i < numImportLIDs; ++i) {
821  const size_t numBytes = numPacketsPerLID[i];
822  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
823  // values, if it has one) for the (possibly run-time-depenendent)
824  // number of bytes of one of its entries.
825  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset,
826  numBytes, sizeof (ST));
827  nnz += numEnt;
828  offset += numBytes;
829  }
830  return nnz;
831 }
832 
833 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
834 void
836  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
837  const Teuchos::ArrayView<const char>& imports,
838  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
839  const size_t constantNumPackets,
840  Distributor& distor,
841  const CombineMode combineMode,
842  const size_t numSameIDs,
843  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
844  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
845  size_t TargetNumRows,
846  size_t TargetNumNonzeros,
847  const int MyTargetPID,
848  const Teuchos::ArrayView<size_t>& CSR_rowptr,
849  const Teuchos::ArrayView<GlobalOrdinal>& CSR_colind,
850  const Teuchos::ArrayView<Scalar>& CSR_vals,
851  const Teuchos::ArrayView<const int>& SourcePids,
852  Teuchos::Array<int>& TargetPids)
853 {
855  using Kokkos::MemoryUnmanaged;
856  using Kokkos::subview;
857  using Kokkos::View;
858  using Teuchos::ArrayRCP;
859  using Teuchos::ArrayView;
860  using Teuchos::as;
861  using Teuchos::av_reinterpret_cast;
862  using Teuchos::outArg;
863  using Teuchos::REDUCE_MAX;
864  using Teuchos::reduceAll;
865  typedef LocalOrdinal LO;
866  typedef GlobalOrdinal GO;
867  typedef typename Node::execution_space execution_space;
868  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
870  typedef typename matrix_type::impl_scalar_type ST;
871  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
872  typedef typename ArrayView<const LO>::size_type size_type;
873  //typedef std::pair<typename Kokkos::View<int*, HES>::size_type,
874  // typename Kokkos::View<int*, HES>::size_type> pair_type;
875  const char prefix[] = "Tpetra::Import_Util::unpackAndCombineIntoCrsArrays: ";
876 
877  const size_t N = TargetNumRows;
878  const size_t mynnz = TargetNumNonzeros;
879  // In the case of reduced communicators, the SourceMatrix won't have
880  // the right "MyPID", so thus we have to supply it.
881  const int MyPID = MyTargetPID;
882 
883  TEUCHOS_TEST_FOR_EXCEPTION(
884  TargetNumRows + 1 != static_cast<size_t> (CSR_rowptr.size ()),
885  std::invalid_argument, prefix << "CSR_rowptr.size() = " <<
886  CSR_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
887  TEUCHOS_TEST_FOR_EXCEPTION(
888  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
889  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
890  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
891  const size_type numImportLIDs = importLIDs.size ();
892  TEUCHOS_TEST_FOR_EXCEPTION(
893  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
894  prefix << "importLIDs.size() = " << numImportLIDs << " != "
895  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
896 
897  // Kokkos::View of the input buffer of bytes to unpack.
898  View<const char*, HES, MemoryUnmanaged> importsK (imports.getRawPtr (),
899  imports.size ());
900  // Zero the rowptr
901  for (size_t i = 0; i< N+1; ++i) {
902  CSR_rowptr[i] = 0;
903  }
904 
905  // SameIDs: Always first, always in the same place
906  for (size_t i = 0; i < numSameIDs; ++i) {
907  CSR_rowptr[i] = SourceMatrix.getNumEntriesInLocalRow (static_cast<LO> (i));
908  }
909 
910  // PermuteIDs: Still local, but reordered
911  size_t numPermuteIDs = permuteToLIDs.size ();
912  for (size_t i = 0; i < numPermuteIDs; ++i) {
913  CSR_rowptr[permuteToLIDs[i]] =
914  SourceMatrix.getNumEntriesInLocalRow (permuteFromLIDs[i]);
915  }
916 
917  // Setup CSR_rowptr for remotes
918  {
919  size_t offset = 0;
920  for (size_type k = 0; k < numImportLIDs; ++k) {
921  const size_t numBytes = numPacketsPerLID[k];
922  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
923  // values, if it has one) for the (possibly run-time -
924  // depenendent) number of bytes of one of its entries.
925  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset,
926  numBytes, sizeof (ST));
927  CSR_rowptr[importLIDs[k]] += numEnt;
928  offset += numBytes;
929  }
930  }
931 
932  // If multiple processes contribute to the same row, we may need to
933  // update row offsets. This tracks that.
934  Teuchos::Array<size_t> NewStartRow (N + 1);
935 
936  // Turn row length into a real CSR_rowptr
937  size_t last_len = CSR_rowptr[0];
938  CSR_rowptr[0] = 0;
939  for (size_t i = 1; i < N+1; ++i) {
940  size_t new_len = CSR_rowptr[i];
941  CSR_rowptr[i] = last_len + CSR_rowptr[i-1];
942  NewStartRow[i] = CSR_rowptr[i];
943  last_len = new_len;
944  }
945 
946  TEUCHOS_TEST_FOR_EXCEPTION(
947  CSR_rowptr[N] != mynnz, std::invalid_argument, prefix << "CSR_rowptr[last]"
948  " = " << CSR_rowptr[N] << "!= mynnz = " << mynnz << ".");
949 
950  // Preseed TargetPids with -1 for local
951  if (static_cast<size_t> (TargetPids.size ()) != mynnz) {
952  TargetPids.resize (mynnz);
953  }
954  TargetPids.assign (mynnz, -1);
955 
956  // Grab pointers for SourceMatrix
957  ArrayRCP<const size_t> Source_rowptr_RCP;
958  ArrayRCP<const LO> Source_colind_RCP;
959  ArrayRCP<const Scalar> Source_vals_RCP;
960  SourceMatrix.getAllValues (Source_rowptr_RCP, Source_colind_RCP, Source_vals_RCP);
961  ArrayView<const size_t> Source_rowptr = Source_rowptr_RCP ();
962  ArrayView<const LO> Source_colind = Source_colind_RCP ();
963  ArrayView<const Scalar> Source_vals = Source_vals_RCP ();
964 
965  const map_type& sourceColMap = * (SourceMatrix.getColMap());
966  ArrayView<const GO> globalColElts = sourceColMap.getNodeElementList();
967 
968  // SameIDs: Copy the data over
969  for (size_t i = 0; i < numSameIDs; ++i) {
970  size_t FromRow = Source_rowptr[i];
971  size_t ToRow = CSR_rowptr[i];
972  NewStartRow[i] += Source_rowptr[i+1] - Source_rowptr[i];
973 
974  for (size_t j = Source_rowptr[i]; j < Source_rowptr[i+1]; ++j) {
975  CSR_vals[ToRow + j - FromRow] = Source_vals[j];
976  CSR_colind[ToRow + j - FromRow] = globalColElts[Source_colind[j]];
977  TargetPids[ToRow + j - FromRow] =
978  (SourcePids[Source_colind[j]] != MyPID) ?
979  SourcePids[Source_colind[j]] : -1;
980  }
981  }
982 
983  size_t numBytesPerValue = 0;
984  if (PackTraits<ST, HES>::compileTimeSize) {
985  ST val; // assume that ST is default constructible
986  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
987  }
988  else {
989  // Since the packed data come from the source matrix, we can use
990  // the source matrix to get the number of bytes per Scalar value
991  // stored in the matrix. This assumes that all Scalar values in
992  // the source matrix require the same number of bytes. If the
993  // source matrix has no entries on the calling process, then we
994  // have to ask the target matrix (via the output CSR arrays). If
995  // the target matrix has no entries on input on the calling
996  // process, then hope that some process does have some idea how
997  // big a Scalar value is. Of course, if no processes have any
998  // entries, then no values should be packed (though this does
999  // assume that in our packing scheme, rows with zero entries take
1000  // zero bytes).
1001  if (Source_rowptr.size () == 0 || Source_rowptr[Source_rowptr.size () - 1] == 0) {
1002  numBytesPerValue = PackTraits<ST, HES>::packValueCount (CSR_vals[0]);
1003  }
1004  else {
1005  numBytesPerValue = PackTraits<ST, HES>::packValueCount (Source_vals[0]);
1006  }
1007  size_t lclNumBytesPerValue = numBytesPerValue;
1008  Teuchos::RCP<const Teuchos::Comm<int> > comm = SourceMatrix.getComm ();
1009  reduceAll<int, size_t> (*comm, REDUCE_MAX, lclNumBytesPerValue,
1010  outArg (numBytesPerValue));
1011  }
1012 
1013  // PermuteIDs: Copy the data over
1014  for (size_t i = 0; i < numPermuteIDs; ++i) {
1015  LO FromLID = permuteFromLIDs[i];
1016  size_t FromRow = Source_rowptr[FromLID];
1017  size_t ToRow = CSR_rowptr[permuteToLIDs[i]];
1018 
1019  NewStartRow[permuteToLIDs[i]] += Source_rowptr[FromLID+1]-Source_rowptr[FromLID];
1020 
1021  for (size_t j = Source_rowptr[FromLID]; j < Source_rowptr[FromLID+1]; ++j) {
1022  CSR_vals[ToRow + j - FromRow] = Source_vals[j];
1023  CSR_colind[ToRow + j - FromRow] = globalColElts[Source_colind[j]];
1024  TargetPids[ToRow + j - FromRow] =
1025  (SourcePids[Source_colind[j]] != MyPID) ?
1026  SourcePids[Source_colind[j]] : -1;
1027  }
1028  }
1029 
1030  // RemoteIDs: Loop structure following UnpackAndCombine
1031  if (imports.size () > 0) {
1032  size_t offset = 0;
1033 #ifdef HAVE_TPETRA_DEBUG
1034  int lclErr = 0;
1035 #endif
1036 
1037  for (size_t i = 0; i < static_cast<size_t> (numImportLIDs); ++i) {
1038  const size_t numBytes = numPacketsPerLID[i];
1039  if (numBytes == 0) {
1040  // Empty buffer for that row means that the row is empty.
1041  continue;
1042  }
1043  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
1044  // values, if it has one) for the (possibly run-time -
1045  // depenendent) number of bytes of one of its entries.
1046  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset, numBytes,
1047  numBytesPerValue);
1048  const LO lclRow = importLIDs[i];
1049  const size_t StartRow = NewStartRow[lclRow];
1050  NewStartRow[lclRow] += numEnt;
1051 
1052  View<GO*, HES, MemoryUnmanaged> gidsOut =
1053  getNonconstView<GO, HES> (CSR_colind (StartRow, numEnt));
1054  View<int*, HES, MemoryUnmanaged> pidsOut =
1055  getNonconstView<int, HES> (TargetPids (StartRow, numEnt));
1056  ArrayView<Scalar> valsOutS = CSR_vals (StartRow, numEnt);
1057  View<ST*, HES, MemoryUnmanaged> valsOut =
1058  getNonconstView<ST, HES> (av_reinterpret_cast<ST> (valsOutS));
1059 
1060  const size_t numBytesOut =
1061  unpackRow<ST, LO, GO, HES> (gidsOut, pidsOut, valsOut, importsK,
1062  offset, numBytes, numEnt, numBytesPerValue);
1063  if (numBytesOut != numBytes) {
1064 #ifdef HAVE_TPETRA_DEBUG
1065  lclErr = 1;
1066 #endif
1067  break;
1068  }
1069 
1070  // Correct target PIDs.
1071  for (size_t j = 0; j < numEnt; ++j) {
1072  const int pid = pidsOut[j];
1073  pidsOut[j] = (pid != MyPID) ? pid : -1;
1074  }
1075  offset += numBytes;
1076  }
1077 #ifdef HAVE_TPETRA_DEBUG
1078  TEUCHOS_TEST_FOR_EXCEPTION(
1079  offset != static_cast<size_t> (imports.size ()), std::logic_error, prefix
1080  << "After unpacking and counting all the import packets, the final offset"
1081  " in bytes " << offset << " != imports.size() = " << imports.size () <<
1082  ". Please report this bug to the Tpetra developers.");
1083  TEUCHOS_TEST_FOR_EXCEPTION(
1084  lclErr != 0, std::logic_error, prefix << "numBytes != numBytesOut "
1085  "somewhere in unpack loop. This should never happen. "
1086  "Please report this bug to the Tpetra developers.");
1087 #endif // HAVE_TPETRA_DEBUG
1088  }
1089 }
1090 
1091 // Note: This should get merged with the other Tpetra sort routines eventually.
1092 template<typename Scalar, typename Ordinal>
1093 void
1094 sortCrsEntries (const Teuchos::ArrayView<size_t> &CRS_rowptr,
1095  const Teuchos::ArrayView<Ordinal> & CRS_colind,
1096  const Teuchos::ArrayView<Scalar> &CRS_vals)
1097 {
1098  // For each row, sort column entries from smallest to largest.
1099  // Use shell sort. Stable sort so it is fast if indices are already sorted.
1100  // Code copied from Epetra_CrsMatrix::SortEntries()
1101  size_t NumRows = CRS_rowptr.size()-1;
1102  size_t nnz = CRS_colind.size();
1103 
1104  for(size_t i = 0; i < NumRows; i++){
1105  size_t start=CRS_rowptr[i];
1106  if(start >= nnz) continue;
1107 
1108  Scalar* locValues = &CRS_vals[start];
1109  size_t NumEntries = CRS_rowptr[i+1] - start;
1110  Ordinal* locIndices = &CRS_colind[start];
1111 
1112  Ordinal n = NumEntries;
1113  Ordinal m = n/2;
1114 
1115  while(m > 0) {
1116  Ordinal max = n - m;
1117  for(Ordinal j = 0; j < max; j++) {
1118  for(Ordinal k = j; k >= 0; k-=m) {
1119  if(locIndices[k+m] >= locIndices[k])
1120  break;
1121  Scalar dtemp = locValues[k+m];
1122  locValues[k+m] = locValues[k];
1123  locValues[k] = dtemp;
1124  Ordinal itemp = locIndices[k+m];
1125  locIndices[k+m] = locIndices[k];
1126  locIndices[k] = itemp;
1127  }
1128  }
1129  m = m/2;
1130  }
1131  }
1132 }
1133 
1134 // Note: This should get merged with the other Tpetra sort routines eventually.
1135 template<typename Scalar, typename Ordinal>
1136 void
1137 sortAndMergeCrsEntries (const Teuchos::ArrayView<size_t> &CRS_rowptr,
1138  const Teuchos::ArrayView<Ordinal> & CRS_colind,
1139  const Teuchos::ArrayView<Scalar> &CRS_vals)
1140 {
1141  // For each row, sort column entries from smallest to largest,
1142  // merging column ids that are identify by adding values. Use shell
1143  // sort. Stable sort so it is fast if indices are already sorted.
1144  // Code copied from Epetra_CrsMatrix::SortEntries()
1145 
1146  size_t NumRows = CRS_rowptr.size()-1;
1147  size_t nnz = CRS_colind.size();
1148  size_t new_curr=CRS_rowptr[0], old_curr=CRS_rowptr[0];
1149 
1150  for(size_t i = 0; i < NumRows; i++){
1151  size_t start=CRS_rowptr[i];
1152  if(start >= nnz) continue;
1153 
1154  Scalar* locValues = &CRS_vals[start];
1155  size_t NumEntries = CRS_rowptr[i+1] - start;
1156  Ordinal* locIndices = &CRS_colind[start];
1157 
1158  // Sort phase
1159  Ordinal n = NumEntries;
1160  Ordinal m = n/2;
1161 
1162  while(m > 0) {
1163  Ordinal max = n - m;
1164  for(Ordinal j = 0; j < max; j++) {
1165  for(Ordinal k = j; k >= 0; k-=m) {
1166  if(locIndices[k+m] >= locIndices[k])
1167  break;
1168  Scalar dtemp = locValues[k+m];
1169  locValues[k+m] = locValues[k];
1170  locValues[k] = dtemp;
1171  Ordinal itemp = locIndices[k+m];
1172  locIndices[k+m] = locIndices[k];
1173  locIndices[k] = itemp;
1174  }
1175  }
1176  m = m/2;
1177  }
1178 
1179  // Merge & shrink
1180  for(size_t j=CRS_rowptr[i]; j < CRS_rowptr[i+1]; j++) {
1181  if(j > CRS_rowptr[i] && CRS_colind[j]==CRS_colind[new_curr-1]) {
1182  CRS_vals[new_curr-1] += CRS_vals[j];
1183  }
1184  else if(new_curr==j) {
1185  new_curr++;
1186  }
1187  else {
1188  CRS_colind[new_curr] = CRS_colind[j];
1189  CRS_vals[new_curr] = CRS_vals[j];
1190  new_curr++;
1191  }
1192  }
1193 
1194  CRS_rowptr[i] = old_curr;
1195  old_curr=new_curr;
1196  }
1197 
1198  CRS_rowptr[NumRows] = new_curr;
1199 }
1200 
1201 template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1202 void
1203 lowCommunicationMakeColMapAndReindex (const Teuchos::ArrayView<const size_t> &rowptr,
1204  const Teuchos::ArrayView<LocalOrdinal> &colind_LID,
1205  const Teuchos::ArrayView<GlobalOrdinal> &colind_GID,
1206  const Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMapRCP,
1207  const Teuchos::ArrayView<const int> &owningPIDs,
1208  Teuchos::Array<int> &remotePIDs,
1209  Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & colMap)
1210 {
1211  using Teuchos::rcp;
1212  typedef LocalOrdinal LO;
1213  typedef GlobalOrdinal GO;
1214  typedef Tpetra::global_size_t GST;
1215  typedef Tpetra::Map<LO, GO, Node> map_type;
1216  const char prefix[] = "lowCommunicationMakeColMapAndReindex: ";
1217 
1218  // The domainMap is an RCP because there is a shortcut for a
1219  // (common) special case to return the columnMap = domainMap.
1220  const map_type& domainMap = *domainMapRCP;
1221 
1222  // Scan all column indices and sort into two groups:
1223  // Local: those whose GID matches a GID of the domain map on this processor and
1224  // Remote: All others.
1225  const size_t numDomainElements = domainMap.getNodeNumElements ();
1226  Teuchos::Array<bool> LocalGIDs;
1227  if (numDomainElements > 0) {
1228  LocalGIDs.resize (numDomainElements, false); // Assume domain GIDs are not local
1229  }
1230 
1231  // In principle it is good to have RemoteGIDs and RemotGIDList be as
1232  // long as the number of remote GIDs on this processor, but this
1233  // would require two passes through the column IDs, so we make it
1234  // the max of 100 and the number of block rows.
1235  //
1236  // FIXME (mfh 11 Feb 2015) Tpetra::Details::HashTable can hold at
1237  // most INT_MAX entries, but it's possible to have more rows than
1238  // that (if size_t is 64 bits and int is 32 bits).
1239  const size_t numMyRows = rowptr.size () - 1;
1240  const int hashsize = std::max (static_cast<int> (numMyRows), 100);
1241 
1242  Tpetra::Details::HashTable<GO, LO> RemoteGIDs (hashsize);
1243  Teuchos::Array<GO> RemoteGIDList;
1244  RemoteGIDList.reserve (hashsize);
1245  Teuchos::Array<int> PIDList;
1246  PIDList.reserve (hashsize);
1247 
1248  // Here we start using the *LocalOrdinal* colind_LID array. This is
1249  // safe even if both columnIndices arrays are actually the same
1250  // (because LocalOrdinal==GO). For *local* GID's set
1251  // colind_LID with with their LID in the domainMap. For *remote*
1252  // GIDs, we set colind_LID with (numDomainElements+NumRemoteColGIDs)
1253  // before the increment of the remote count. These numberings will
1254  // be separate because no local LID is greater than
1255  // numDomainElements.
1256 
1257  size_t NumLocalColGIDs = 0;
1258  LO NumRemoteColGIDs = 0;
1259  for (size_t i = 0; i < numMyRows; ++i) {
1260  for(size_t j = rowptr[i]; j < rowptr[i+1]; ++j) {
1261  const GO GID = colind_GID[j];
1262  // Check if GID matches a row GID
1263  const LO LID = domainMap.getLocalElement (GID);
1264  if(LID != -1) {
1265  const bool alreadyFound = LocalGIDs[LID];
1266  if (! alreadyFound) {
1267  LocalGIDs[LID] = true; // There is a column in the graph associated with this domain map GID
1268  NumLocalColGIDs++;
1269  }
1270  colind_LID[j] = LID;
1271  }
1272  else {
1273  const LO hash_value = RemoteGIDs.get (GID);
1274  if (hash_value == -1) { // This means its a new remote GID
1275  const int PID = owningPIDs[j];
1276  TEUCHOS_TEST_FOR_EXCEPTION(
1277  PID == -1, std::invalid_argument, prefix << "Cannot figure out if "
1278  "PID is owned.");
1279  colind_LID[j] = static_cast<LO> (numDomainElements + NumRemoteColGIDs);
1280  RemoteGIDs.add (GID, NumRemoteColGIDs);
1281  RemoteGIDList.push_back (GID);
1282  PIDList.push_back (PID);
1283  NumRemoteColGIDs++;
1284  }
1285  else {
1286  colind_LID[j] = static_cast<LO> (numDomainElements + hash_value);
1287  }
1288  }
1289  }
1290  }
1291 
1292  // Possible short-circuit: If all domain map GIDs are present as
1293  // column indices, then set ColMap=domainMap and quit.
1294  if (domainMap.getComm ()->getSize () == 1) {
1295  // Sanity check: When there is only one process, there can be no
1296  // remoteGIDs.
1297  TEUCHOS_TEST_FOR_EXCEPTION(
1298  NumRemoteColGIDs != 0, std::runtime_error, prefix << "There is only one "
1299  "process in the domain Map's communicator, which means that there are no "
1300  "\"remote\" indices. Nevertheless, some column indices are not in the "
1301  "domain Map.");
1302  if (static_cast<size_t> (NumLocalColGIDs) == numDomainElements) {
1303  // In this case, we just use the domainMap's indices, which is,
1304  // not coincidently, what we clobbered colind with up above
1305  // anyway. No further reindexing is needed.
1306  colMap = domainMapRCP;
1307  return;
1308  }
1309  }
1310 
1311  // Now build the array containing column GIDs
1312  // Build back end, containing remote GIDs, first
1313  const LO numMyCols = NumLocalColGIDs + NumRemoteColGIDs;
1314  Teuchos::Array<GO> ColIndices;
1315  GO* RemoteColIndices = NULL;
1316  if (numMyCols > 0) {
1317  ColIndices.resize (numMyCols);
1318  if (NumLocalColGIDs != static_cast<size_t> (numMyCols)) {
1319  RemoteColIndices = &ColIndices[NumLocalColGIDs]; // Points to back half of ColIndices
1320  }
1321  }
1322 
1323  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1324  RemoteColIndices[i] = RemoteGIDList[i];
1325  }
1326 
1327  // Build permute array for *remote* reindexing.
1328  Teuchos::Array<LO> RemotePermuteIDs (NumRemoteColGIDs);
1329  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1330  RemotePermuteIDs[i]=i;
1331  }
1332 
1333  // Sort External column indices so that all columns coming from a
1334  // given remote processor are contiguous. This is a sort with two
1335  // auxillary arrays: RemoteColIndices and RemotePermuteIDs.
1336  Tpetra::sort3 (PIDList.begin (), PIDList.end (),
1337  ColIndices.begin () + NumLocalColGIDs,
1338  RemotePermuteIDs.begin ());
1339 
1340  // Stash the RemotePIDs.
1341  //
1342  // Note: If Teuchos::Array had a shrink_to_fit like std::vector,
1343  // we'd call it here.
1344  remotePIDs = PIDList;
1345 
1346  // Sort external column indices so that columns from a given remote
1347  // processor are not only contiguous but also in ascending
1348  // order. NOTE: I don't know if the number of externals associated
1349  // with a given remote processor is known at this point ... so I
1350  // count them here.
1351 
1352  // NTS: Only sort the RemoteColIndices this time...
1353  LO StartCurrent = 0, StartNext = 1;
1354  while (StartNext < NumRemoteColGIDs) {
1355  if (PIDList[StartNext]==PIDList[StartNext-1]) {
1356  StartNext++;
1357  }
1358  else {
1359  Tpetra::sort2 (ColIndices.begin () + NumLocalColGIDs + StartCurrent,
1360  ColIndices.begin () + NumLocalColGIDs + StartNext,
1361  RemotePermuteIDs.begin () + StartCurrent);
1362  StartCurrent = StartNext;
1363  StartNext++;
1364  }
1365  }
1366  Tpetra::sort2 (ColIndices.begin () + NumLocalColGIDs + StartCurrent,
1367  ColIndices.begin () + NumLocalColGIDs + StartNext,
1368  RemotePermuteIDs.begin () + StartCurrent);
1369 
1370  // Reverse the permutation to get the information we actually care about
1371  Teuchos::Array<LO> ReverseRemotePermuteIDs (NumRemoteColGIDs);
1372  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1373  ReverseRemotePermuteIDs[RemotePermuteIDs[i]] = i;
1374  }
1375 
1376  // Build permute array for *local* reindexing.
1377  bool use_local_permute = false;
1378  Teuchos::Array<LO> LocalPermuteIDs (numDomainElements);
1379 
1380  // Now fill front end. Two cases:
1381  //
1382  // (1) If the number of Local column GIDs is the same as the number
1383  // of Local domain GIDs, we can simply read the domain GIDs into
1384  // the front part of ColIndices, otherwise
1385  //
1386  // (2) We step through the GIDs of the domainMap, checking to see if
1387  // each domain GID is a column GID. we want to do this to
1388  // maintain a consistent ordering of GIDs between the columns
1389  // and the domain.
1390  Teuchos::ArrayView<const GO> domainGlobalElements = domainMap.getNodeElementList();
1391  if (static_cast<size_t> (NumLocalColGIDs) == numDomainElements) {
1392  if (NumLocalColGIDs > 0) {
1393  // Load Global Indices into first numMyCols elements column GID list
1394  std::copy (domainGlobalElements.begin (), domainGlobalElements.end (),
1395  ColIndices.begin ());
1396  }
1397  }
1398  else {
1399  LO NumLocalAgain = 0;
1400  use_local_permute = true;
1401  for (size_t i = 0; i < numDomainElements; ++i) {
1402  if (LocalGIDs[i]) {
1403  LocalPermuteIDs[i] = NumLocalAgain;
1404  ColIndices[NumLocalAgain++] = domainGlobalElements[i];
1405  }
1406  }
1407  TEUCHOS_TEST_FOR_EXCEPTION(
1408  static_cast<size_t> (NumLocalAgain) != NumLocalColGIDs,
1409  std::runtime_error, prefix << "Local ID count test failed.");
1410  }
1411 
1412  // Make column Map
1413  const GST minus_one = Teuchos::OrdinalTraits<GST>::invalid ();
1414  colMap = rcp (new map_type (minus_one, ColIndices, domainMap.getIndexBase (),
1415  domainMap.getComm (), domainMap.getNode ()));
1416 
1417  // Low-cost reindex of the matrix
1418  for (size_t i = 0; i < numMyRows; ++i) {
1419  for (size_t j = rowptr[i]; j < rowptr[i+1]; ++j) {
1420  const LO ID = colind_LID[j];
1421  if (static_cast<size_t> (ID) < numDomainElements) {
1422  if (use_local_permute) {
1423  colind_LID[j] = LocalPermuteIDs[colind_LID[j]];
1424  }
1425  // In the case where use_local_permute==false, we just copy
1426  // the DomainMap's ordering, which it so happens is what we
1427  // put in colind_LID to begin with.
1428  }
1429  else {
1430  colind_LID[j] = NumLocalColGIDs + ReverseRemotePermuteIDs[colind_LID[j]-numDomainElements];
1431  }
1432  }
1433  }
1434 }
1435 
1436 } // namespace Import_Util
1437 } // namespace Tpetra
1438 
1439 // We can include the definitions for Tpetra::CrsMatrix now that the above
1440 // functions have been defined. For ETI, this isn't necessary, so we just
1441 // including the generated hpp
1442 #include "Tpetra_CrsMatrix.hpp"
1443 
1444 #endif // TPETRA_IMPORT_UTIL_HPP
void sortAndMergeCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort and merge the entries of the (raw CSR) matrix by column index within each row.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void unpackAndCombineIntoCrsArrays(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, int MyTargetPID, const Teuchos::ArrayView< size_t > &rowPointers, const Teuchos::ArrayView< GlobalOrdinal > &columnIndices, const Teuchos::ArrayView< Scalar > &values, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort3(const IT1 &first1, const IT1 &last1, const IT2 &first2, const IT3 &first3)
Sort the first array, and apply the same permutation to the second and third arrays.
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
Declaration of the Tpetra::CrsMatrix class.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
size_t unpackAndCombineWithOwningPIDsCount(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::CrsMatrix::unpackAndCombine that also unpacks owning process ranks...
Kokkos::View< char *, D, Kokkos::MemoryUnmanaged > output_buffer_type
The type of an output buffer of bytes.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
void sortCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort the entries of the (raw CSR) matrix by column index within each row.
void lowCommunicationMakeColMapAndReindex(const Teuchos::ArrayView< const size_t > &rowPointers, const Teuchos::ArrayView< LocalOrdinal > &columnIndices_LID, const Teuchos::ArrayView< GlobalOrdinal > &columnIndices_GID, const Teuchos::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::ArrayView< const int > &owningPids, Teuchos::Array< int > &remotePids, Teuchos::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap)
lowCommunicationMakeColMapAndReindex
size_t global_size_t
Global size_t object.
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
void packAndPrepareWithOwningPIDs(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &exportLIDs, Kokkos::DualView< char *, typename Node::device_type > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t &constantNumPackets, Distributor &distor, const Teuchos::ArrayView< const int > &SourcePids)
Special version of Tpetra::CrsMatrix::packAndPrepare that also packs owning process ranks...
void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView< const LocalOrdinal > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
Kokkos::View< const char *, D, Kokkos::MemoryUnmanaged > input_buffer_type
The type of an input buffer of bytes.
size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const
Returns the current number of entries on this node in the specified local row.
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2)
Sort the first array, and apply the resulting permutation to the second array.
ValueType get(const KeyType key)
Get the value corresponding to the given key.
Describes a parallel distribution of objects over processes.
void add(const KeyType key, const ValueType value)
Add a key and its value to the hash table.
Kokkos::View< value_type *, D, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
Stand-alone utility functions and macros.
Teuchos::RCP< const map_type > getColMap() const
The Map that describes the column distribution in this matrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix is distributed.
bool isLocallyIndexed() const
Whether the matrix is locally indexed on the calling process.