Tpetra parallel linear algebra  Version of the Day
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
47 #include "Tpetra_Util.hpp"
48 #include "Tpetra_ConfigDefs.hpp"
49 #include "Tpetra_CrsMatrix.hpp"
51 #include "Tpetra_RowMatrixTransposer.hpp"
52 #include "Tpetra_ConfigDefs.hpp"
53 #include "Tpetra_Map.hpp"
54 #include "Tpetra_Export.hpp"
55 #include "Tpetra_Import_Util.hpp"
56 #include "Tpetra_Import_Util2.hpp"
57 #include <algorithm>
58 #include "Teuchos_FancyOStream.hpp"
59 
65 namespace Tpetra {
66 
67 namespace MatrixMatrix{
68 
69 //
70 // This method forms the matrix-matrix product C = op(A) * op(B), where
71 // op(A) == A if transposeA is false,
72 // op(A) == A^T if transposeA is true,
73 // and similarly for op(B).
74 //
75 template <class Scalar,
76  class LocalOrdinal,
77  class GlobalOrdinal,
78  class Node>
79 void Multiply(
81  bool transposeA,
83  bool transposeB,
85  bool call_FillComplete_on_result,
86  const std::string& label,
87  const Teuchos::RCP<Teuchos::ParameterList>& params)
88 {
89  using Teuchos::null;
90  using Teuchos::RCP;
91  typedef Scalar SC;
92  typedef LocalOrdinal LO;
93  typedef GlobalOrdinal GO;
94  typedef Node NO;
95  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
96  typedef Import<LO,GO,NO> import_type;
97  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
98  typedef Map<LO,GO,NO> map_type;
99  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
100 
101 #ifdef HAVE_TPETRA_MMM_TIMINGS
102  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
103  using Teuchos::TimeMonitor;
104  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
105 #endif
106 
107  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
108 
109  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
110 
111  // The input matrices A and B must both be fillComplete.
112  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
113  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
114 
115  // If transposeA is true, then Aprime will be the transpose of A
116  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
117  // will just be a pointer to A.
118  RCP<const crs_matrix_type> Aprime = null;
119  // If transposeB is true, then Bprime will be the transpose of B
120  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
121  // will just be a pointer to B.
122  RCP<const crs_matrix_type> Bprime = null;
123 
124  // Is this a "clean" matrix?
125  //
126  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
127  // locally nor globally indexed, then it was empty. I don't like
128  // this, because the most straightforward implementation presumes
129  // lazy allocation of indices. However, historical precedent
130  // demands that we keep around this predicate as a way to test
131  // whether the matrix is empty.
132  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
133 
134  bool use_optimized_ATB = false;
135  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
136  use_optimized_ATB = true;
137 
138 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
139  use_optimized_ATB = false;
140 #endif
141 
142  if (!use_optimized_ATB && transposeA) {
143  transposer_type transposer(rcpFromRef (A));
144  Aprime = transposer.createTranspose();
145 
146  } else {
147  Aprime = rcpFromRef(A);
148  }
149 
150  if (transposeB) {
151  transposer_type transposer(rcpFromRef (B));
152  Bprime = transposer.createTranspose();
153 
154  } else {
155  Bprime = rcpFromRef(B);
156  }
157 
158  // Check size compatibility
159  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
160  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
161  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
162  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
163  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
164  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
165  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
166  prefix << "ERROR, inner dimensions of op(A) and op(B) "
167  "must match for matrix-matrix product. op(A) is "
168  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
169 
170  // The result matrix C must at least have a row-map that reflects the correct
171  // row-size. Don't check the number of columns because rectangular matrices
172  // which were constructed with only one map can still end up having the
173  // correct capacity and dimensions when filled.
174  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
175  prefix << "ERROR, dimensions of result C must "
176  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
177  << " rows, should have at least " << Aouter << std::endl);
178 
179  // It doesn't matter whether C is already Filled or not. If it is already
180  // Filled, it must have space allocated for the positions that will be
181  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
182  // we'll error out later when trying to store result values.
183 
184  // CGB: However, matrix must be in active-fill
185  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
186 
187  // We're going to need to import remotely-owned sections of A and/or B if
188  // more than one processor is performing this run, depending on the scenario.
189  int numProcs = A.getComm()->getSize();
190 
191  // Declare a couple of structs that will be used to hold views of the data
192  // of A and B, to be used for fast access during the matrix-multiplication.
193  crs_matrix_struct_type Aview;
194  crs_matrix_struct_type Bview;
195 
196  RCP<const map_type> targetMap_A = Aprime->getRowMap();
197  RCP<const map_type> targetMap_B = Bprime->getRowMap();
198 
199 #ifdef HAVE_TPETRA_MMM_TIMINGS
200  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X"))));
201 #endif
202 
203  // Now import any needed remote rows and populate the Aview struct
204  // NOTE: We assert that an import isn't needed --- since we do the transpose
205  // above to handle that.
206  if (!use_optimized_ATB) {
207  RCP<const import_type> dummyImporter;
208  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
209  }
210 
211  // We will also need local access to all rows of B that correspond to the
212  // column-map of op(A).
213  if (numProcs > 1)
214  targetMap_B = Aprime->getColMap();
215 
216  // Import any needed remote rows and populate the Bview struct.
217  if (!use_optimized_ATB)
218  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label, params);
219 
220 #ifdef HAVE_TPETRA_MMM_TIMINGS
221  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
222 #endif
223 
224  // Call the appropriate method to perform the actual multiplication.
225  if (use_optimized_ATB) {
226  MMdetails::mult_AT_B_newmatrix(A, B, C, label);
227 
228  } else if (call_FillComplete_on_result && newFlag) {
229  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label);
230 
231  } else if (call_FillComplete_on_result) {
232  MMdetails::mult_A_B_reuse(Aview, Bview, C, label);
233 
234  } else {
235  // mfh 27 Sep 2016: Is this the "slow" case? This
236  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
237  // thread-parallel inserts, but that may take some effort.
238  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
239 
240  MMdetails::mult_A_B(Aview, Bview, crsmat, label);
241 
242 #ifdef HAVE_TPETRA_MMM_TIMINGS
243  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete"))));
244 #endif
245  if (call_FillComplete_on_result) {
246  // We'll call FillComplete on the C matrix before we exit, and give it a
247  // domain-map and a range-map.
248  // The domain-map will be the domain-map of B, unless
249  // op(B)==transpose(B), in which case the range-map of B will be used.
250  // The range-map will be the range-map of A, unless op(A)==transpose(A),
251  // in which case the domain-map of A will be used.
252  if (!C.isFillComplete())
253  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
254  }
255  }
256 }
257 
258 
259 template <class Scalar,
260  class LocalOrdinal,
261  class GlobalOrdinal,
262  class Node>
263 void Jacobi(Scalar omega,
268  bool call_FillComplete_on_result,
269  const std::string& label,
270  const Teuchos::RCP<Teuchos::ParameterList>& params)
271 {
272  using Teuchos::RCP;
273  typedef Scalar SC;
274  typedef LocalOrdinal LO;
275  typedef GlobalOrdinal GO;
276  typedef Node NO;
277  typedef Import<LO,GO,NO> import_type;
278  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
279  typedef Map<LO,GO,NO> map_type;
280  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
281 
282 #ifdef HAVE_TPETRA_MMM_TIMINGS
283  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
284  using Teuchos::TimeMonitor;
285  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup"))));
286 #endif
287 
288  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
289 
290  // A and B should already be Filled.
291  // Should we go ahead and call FillComplete() on them if necessary or error
292  // out? For now, we choose to error out.
293  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
294  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
295 
296  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
297  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
298 
299  // Now check size compatibility
300  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
301  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
302  global_size_t Aouter = A.getGlobalNumRows();
303  global_size_t Bouter = numBCols;
304  global_size_t Ainner = numACols;
305  global_size_t Binner = B.getGlobalNumRows();
306  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
307  prefix << "ERROR, inner dimensions of op(A) and op(B) "
308  "must match for matrix-matrix product. op(A) is "
309  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
310 
311  // The result matrix C must at least have a row-map that reflects the correct
312  // row-size. Don't check the number of columns because rectangular matrices
313  // which were constructed with only one map can still end up having the
314  // correct capacity and dimensions when filled.
315  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
316  prefix << "ERROR, dimensions of result C must "
317  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
318  << " rows, should have at least "<< Aouter << std::endl);
319 
320  // It doesn't matter whether C is already Filled or not. If it is already
321  // Filled, it must have space allocated for the positions that will be
322  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
323  // we'll error out later when trying to store result values.
324 
325  // CGB: However, matrix must be in active-fill
326  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
327 
328  // We're going to need to import remotely-owned sections of A and/or B if
329  // more than one processor is performing this run, depending on the scenario.
330  int numProcs = A.getComm()->getSize();
331 
332  // Declare a couple of structs that will be used to hold views of the data of
333  // A and B, to be used for fast access during the matrix-multiplication.
334  crs_matrix_struct_type Aview;
335  crs_matrix_struct_type Bview;
336 
337  RCP<const map_type> targetMap_A = Aprime->getRowMap();
338  RCP<const map_type> targetMap_B = Bprime->getRowMap();
339 
340 #ifdef HAVE_TPETRA_MMM_TIMINGS
341  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X"))));
342 #endif
343 
344  //Now import any needed remote rows and populate the Aview struct.
345  RCP<const import_type> dummyImporter;
346  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, false, label);
347 
348  // We will also need local access to all rows of B that correspond to the
349  // column-map of op(A).
350  if (numProcs > 1)
351  targetMap_B = Aprime->getColMap();
352 
353  // Now import any needed remote rows and populate the Bview struct.
354  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label);
355 
356 #ifdef HAVE_TPETRA_MMM_TIMINGS
357  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply"))));
358 #endif
359 
360  // Now call the appropriate method to perform the actual multiplication.
361  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
362 
363  // Is this a "clean" matrix
364  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
365 
366  if (call_FillComplete_on_result && newFlag) {
367  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label);
368 
369  } else if (call_FillComplete_on_result) {
370  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label);
371 
372  } else {
373  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
374  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
375  // commenting it out.
376 // #ifdef HAVE_TPETRA_MMM_TIMINGS
377 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
378 // #endif
379  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
380  // commenting it out.
381  // if (call_FillComplete_on_result) {
382  // //We'll call FillComplete on the C matrix before we exit, and give
383  // //it a domain-map and a range-map.
384  // //The domain-map will be the domain-map of B, unless
385  // //op(B)==transpose(B), in which case the range-map of B will be used.
386  // //The range-map will be the range-map of A, unless
387  // //op(A)==transpose(A), in which case the domain-map of A will be used.
388  // if (!C.isFillComplete()) {
389  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
390  // }
391  // }
392  }
393 }
394 
395 
396 template <class Scalar,
397  class LocalOrdinal,
398  class GlobalOrdinal,
399  class Node>
400 void Add(
402  bool transposeA,
403  Scalar scalarA,
405  Scalar scalarB )
406 {
407  using Teuchos::Array;
408  using Teuchos::RCP;
409  using Teuchos::null;
410  typedef Scalar SC;
411  typedef LocalOrdinal LO;
412  typedef GlobalOrdinal GO;
413  typedef Node NO;
414  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
415  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
416 
417  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
418 
419  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
420  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
421  "(Result matrix B is not required to be isFillComplete()).");
422  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
423  prefix << "ERROR, input matrix B must not be fill complete!");
424  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
425  prefix << "ERROR, input matrix B must not have static graph!");
426  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
427  prefix << "ERROR, input matrix B must not be locally indexed!");
428  TEUCHOS_TEST_FOR_EXCEPTION(B.getProfileType()!=DynamicProfile, std::runtime_error,
429  prefix << "ERROR, input matrix B must have a dynamic profile!");
430 
431 
432  RCP<const crs_matrix_type> Aprime = null;
433  if (transposeA) {
434  transposer_type transposer(rcpFromRef (A));
435  Aprime = transposer.createTranspose();
436  } else {
437  Aprime = rcpFromRef(A);
438  }
439 
440  size_t a_numEntries;
441  Array<GO> a_inds(A.getNodeMaxNumRowEntries());
442  Array<SC> a_vals(A.getNodeMaxNumRowEntries());
443  GO row;
444 
445  if (scalarB != Teuchos::ScalarTraits<SC>::one())
446  B.scale(scalarB);
447 
448  bool bFilled = B.isFillComplete();
449  size_t numMyRows = B.getNodeNumRows();
450  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
451  for (LO i = 0; (size_t)i < numMyRows; ++i) {
452  row = B.getRowMap()->getGlobalElement(i);
453  Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
454 
455  if (scalarA != Teuchos::ScalarTraits<SC>::one())
456  for (size_t j = 0; j < a_numEntries; ++j)
457  a_vals[j] *= scalarA;
458 
459  if (bFilled)
460  B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
461  else
462  B.insertGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
463  }
464  }
465 }
466 
467 
468 template <class Scalar,
469  class LocalOrdinal,
470  class GlobalOrdinal,
471  class Node>
472 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
473 add (const Scalar& alpha,
474  const bool transposeA,
476  const Scalar& beta,
477  const bool transposeB,
479  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
480  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
481  const Teuchos::RCP<Teuchos::ParameterList>& params)
482 {
483  using Teuchos::RCP;
484  using Teuchos::rcpFromRef;
485  using Teuchos::rcp_implicit_cast;
486  using Teuchos::rcp_dynamic_cast;
487 
488  typedef Scalar SC;
489  typedef LocalOrdinal LO;
490  typedef GlobalOrdinal GO;
491  typedef Node NO;
492  typedef RowMatrix<SC,LO,GO,NO> row_matrix_type;
493  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
494  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
495 
496  const std::string prefix = "TpetraExt::MatrixMatrix::add(): ";
497 
498  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete () || !B.isFillComplete (), std::invalid_argument,
499  prefix << "A and B must both be fill complete.");
500 
501 #ifdef HAVE_TPETRA_DEBUG
502  // The matrices don't have domain or range Maps unless they are fill complete.
503  if (A.isFillComplete () && B.isFillComplete ()) {
504  const bool domainMapsSame =
505  (!transposeA && !transposeB && !A.getDomainMap()->isSameAs (*B.getDomainMap ())) ||
506  (!transposeA && transposeB && !A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
507  ( transposeA && !transposeB && !A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
508  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
509  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
510 
511  const bool rangeMapsSame =
512  (!transposeA && !transposeB && !A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
513  (!transposeA && transposeB && !A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
514  ( transposeA && !transposeB && !A.getDomainMap()->isSameAs (*B.getRangeMap ()));
515  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
516  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
517  }
518 #endif // HAVE_TPETRA_DEBUG
519 
520  // Form the explicit transpose of A if necessary.
521  RCP<const crs_matrix_type> Aprime;
522  if (transposeA) {
523  transposer_type transposer (rcpFromRef (A));
524  Aprime = transposer.createTranspose ();
525 
526  } else {
527  Aprime = rcpFromRef (A);
528  }
529 
530 #ifdef HAVE_TPETRA_DEBUG
531  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
532  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
533 #endif // HAVE_TPETRA_DEBUG
534 
535  // Form the explicit transpose of B if necessary.
536  RCP<const crs_matrix_type> Bprime;
537  if (transposeB) {
538  transposer_type transposer (rcpFromRef (B));
539  Bprime = transposer.createTranspose ();
540 
541  } else {
542  Bprime = rcpFromRef (B);
543  }
544 
545 #ifdef HAVE_TPETRA_DEBUG
546  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
547  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
548 
549  TEUCHOS_TEST_FOR_EXCEPTION(
550  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
551  prefix << "Aprime and Bprime must both be fill complete. "
552  "Please report this bug to the Tpetra developers.");
553 #endif // HAVE_TPETRA_DEBUG
554 
555  RCP<row_matrix_type> C =
556  Bprime->add (alpha, *rcp_implicit_cast<const row_matrix_type> (Aprime),
557  beta, domainMap, rangeMap, params);
558 
559  return rcp_dynamic_cast<crs_matrix_type> (C);
560 }
561 
562 
563 template <class Scalar,
564  class LocalOrdinal,
565  class GlobalOrdinal,
566  class Node>
567 void Add(
569  bool transposeA,
570  Scalar scalarA,
572  bool transposeB,
573  Scalar scalarB,
575 {
576  using Teuchos::as;
577  using Teuchos::Array;
578  using Teuchos::ArrayRCP;
579  using Teuchos::ArrayView;
580  using Teuchos::RCP;
581  using Teuchos::rcp;
582  using Teuchos::rcp_dynamic_cast;
583  using Teuchos::rcpFromRef;
584  using Teuchos::tuple;
585  using std::endl;
586  // typedef typename ArrayView<const Scalar>::size_type size_type;
587  typedef Teuchos::ScalarTraits<Scalar> STS;
589  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
590  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
591  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
594 
595  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
596 
597  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
598  prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
599 
600  TEUCHOS_TEST_FOR_EXCEPTION(
601  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
602  prefix << "Both input matrices must be fill complete before calling this function.");
603 
604 #ifdef HAVE_TPETRA_DEBUG
605  {
606  const bool domainMapsSame =
607  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
608  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
609  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
610  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
611  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
612 
613  const bool rangeMapsSame =
614  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
615  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
616  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
617  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
618  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
619  }
620 #endif // HAVE_TPETRA_DEBUG
621 
622  // Form the explicit transpose of A if necessary.
623  RCP<const crs_matrix_type> Aprime;
624  if (transposeA) {
625  transposer_type theTransposer (rcpFromRef (A));
626  Aprime = theTransposer.createTranspose ();
627  } else {
628  Aprime = rcpFromRef (A);
629  }
630 
631 #ifdef HAVE_TPETRA_DEBUG
632  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
633  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
634 #endif // HAVE_TPETRA_DEBUG
635 
636  // Form the explicit transpose of B if necessary.
637  RCP<const crs_matrix_type> Bprime;
638  if (transposeB) {
639  transposer_type theTransposer (rcpFromRef (B));
640  Bprime = theTransposer.createTranspose ();
641  } else {
642  Bprime = rcpFromRef (B);
643  }
644 
645 #ifdef HAVE_TPETRA_DEBUG
646  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
647  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
648 #endif // HAVE_TPETRA_DEBUG
649 
650  // Allocate or zero the entries of the result matrix.
651  if (! C.is_null ()) {
652  C->setAllToScalar (STS::zero ());
653  } else {
654 #if 0
655  // If Aprime and Bprime have the same row Map, and if C is null,
656  // we can optimize construction and fillComplete of C. For now,
657  // we just check pointer equality, to avoid the all-reduce in
658  // isSameAs. It may be worth that all-reduce to check, however.
659  //if (Aprime->getRowMap ().getRawPtr () == Bprime->getRowMap ().getRawPtr ())
660  if (Aprime->getRowMap ()->isSameAs (* (Bprime->getRowMap ())) {
661  RCP<const map_type> rowMap = Aprime->getRowMap ();
662 
663  RCP<const crs_graph_type> A_graph =
664  rcp_dynamic_cast<const crs_graph_type> (Aprime->getGraph (), true);
665 #ifdef HAVE_TPETRA_DEBUG
666  TEUCHOS_TEST_FOR_EXCEPTION(A_graph.is_null (), std::logic_error,
667  "Tpetra::MatrixMatrix::Add: Graph of Op(A) is null. "
668  "Please report this bug to the Tpetra developers.");
669 #endif // HAVE_TPETRA_DEBUG
670  RCP<const crs_graph_type> B_graph =
671  rcp_dynamic_cast<const crs_graph_type> (Bprime->getGraph (), true);
672 #ifdef HAVE_TPETRA_DEBUG
673  TEUCHOS_TEST_FOR_EXCEPTION(B_graph.is_null (), std::logic_error,
674  "Tpetra::MatrixMatrix::Add: Graph of Op(B) is null. "
675  "Please report this bug to the Tpetra developers.");
676 #endif // HAVE_TPETRA_DEBUG
677  RCP<const map_type> A_colMap = A_graph->getColMap ();
678 #ifdef HAVE_TPETRA_DEBUG
679  TEUCHOS_TEST_FOR_EXCEPTION(A_colMap.is_null (), std::logic_error,
680  "Tpetra::MatrixMatrix::Add: Column Map of Op(A) is null. "
681  "Please report this bug to the Tpetra developers.");
682 #endif // HAVE_TPETRA_DEBUG
683  RCP<const map_type> B_colMap = B_graph->getColMap ();
684 #ifdef HAVE_TPETRA_DEBUG
685  TEUCHOS_TEST_FOR_EXCEPTION(B_colMap.is_null (), std::logic_error,
686  "Tpetra::MatrixMatrix::Add: Column Map of Op(B) is null. "
687  "Please report this bug to the Tpetra developers.");
688  TEUCHOS_TEST_FOR_EXCEPTION(A_graph->getImporter ().is_null (),
689  std::logic_error,
690  "Tpetra::MatrixMatrix::Add: Op(A)'s Import is null. "
691  "Please report this bug to the Tpetra developers.");
692  TEUCHOS_TEST_FOR_EXCEPTION(B_graph->getImporter ().is_null (),
693  std::logic_error,
694  "Tpetra::MatrixMatrix::Add: Op(B)'s Import is null. "
695  "Please report this bug to the Tpetra developers.");
696 #endif // HAVE_TPETRA_DEBUG
697 
698  // Compute the (column Map and) Import of the matrix sum.
699  RCP<const import_type> sumImport =
700  A_graph->getImporter ()->setUnion (* (B_graph->getImporter ()));
701  RCP<const map_type> C_colMap = sumImport->getTargetMap ();
702 
703  // First, count the number of entries in each row. Then, go
704  // back over the rows again, and compute the actual sum.
705  // Remember that C may have a different column Map than Aprime
706  // or Bprime, so its local indices may be different. That's why
707  // we have to convert from local to global indices.
708 
709  ArrayView<const LocalOrdinal> A_local_ind;
710  Array<GlobalOrdinal> A_global_ind;
711  ArrayView<const LocalOrdinal> B_local_ind;
712  Array<GlobalOrdinal> B_global_ind;
713 
714  const size_t localNumRows = rowMap->getNodeNumElements ();
715  ArrayRCP<size_t> numEntriesPerRow (localNumRows);
716  // Compute the max number of entries in any row of A+B on this
717  // process, so that we won't have to resize temporary arrays.
718  size_t maxNumEntriesPerRow = 0;
719  for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
720  // Get view of current row of A_graph, in its local indices.
721  A_graph->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind);
722  const size_type A_numEnt = A_local_ind.size ();
723  if (A_numEnt > A_global_ind.size ()) {
724  A_global_ind.resize (A_numEnt);
725  }
726  // Convert A's local indices to global indices.
727  for (size_type k = 0; k < A_numEnt; ++k) {
728  A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
729  }
730 
731  // Get view of current row of B_graph, in its local indices.
732  B_graph->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind);
733  const size_type B_numEnt = B_local_ind.size ();
734  if (B_numEnt > B_global_ind.size ()) {
735  B_global_ind.resize (B_numEnt);
736  }
737  // Convert B's local indices to global indices.
738  for (size_type k = 0; k < B_numEnt; ++k) {
739  B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
740  }
741 
742  // Count the number of entries in the merged row of A + B.
743  const size_t curNumEntriesPerRow =
744  keyMergeCount (A_global_ind.begin (), A_global_ind.end (),
745  B_global_ind.begin (), B_global_ind.end ());
746  numEntriesPerRow[localRow] = curNumEntriesPerRow;
747  maxNumEntriesPerRow = std::max (maxNumEntriesPerRow, curNumEntriesPerRow);
748  }
749 
750  // Create C, using the sum column Map and number of entries per
751  // row that we computed above. Having the exact number of
752  // entries per row lets us use static profile, making it valid
753  // to call expertStaticFillComplete.
754  C = rcp (new crs_matrix_type (rowMap, C_colMap, numEntriesPerRow, StaticProfile));
755 
756  // Go back through the rows and actually compute the sum. We
757  // don't ever have to resize A_global_ind or B_global_ind below,
758  // since we've already done it above.
759  ArrayView<const Scalar> A_val;
760  ArrayView<const Scalar> B_val;
761 
762  Array<LocalOrdinal> AplusB_local_ind (maxNumEntriesPerRow);
763  Array<GlobalOrdinal> AplusB_global_ind (maxNumEntriesPerRow);
764  Array<Scalar> AplusB_val (maxNumEntriesPerRow);
765 
766  for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
767  // Get view of current row of A, in A's local indices.
768  Aprime->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind, A_val);
769  // Convert A's local indices to global indices.
770  for (size_type k = 0; k < A_local_ind.size (); ++k) {
771  A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
772  }
773 
774  // Get view of current row of B, in B's local indices.
775  Bprime->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind, B_val);
776  // Convert B's local indices to global indices.
777  for (size_type k = 0; k < B_local_ind.size (); ++k) {
778  B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
779  }
780 
781  const size_t curNumEntries = numEntriesPerRow[localRow];
782  ArrayView<LocalOrdinal> C_local_ind = AplusB_local_ind (0, curNumEntries);
783  ArrayView<GlobalOrdinal> C_global_ind = AplusB_global_ind (0, curNumEntries);
784  ArrayView<Scalar> C_val = AplusB_val (0, curNumEntries);
785 
786  // Sum the entries in the current row of A plus B.
787  keyValueMerge (A_global_ind.begin (), A_global_ind.end (),
788  A_val.begin (), A_val.end (),
789  B_global_ind.begin (), B_global_ind.end (),
790  B_val.begin (), B_val.end (),
791  C_global_ind.begin (), C_val.begin (),
792  std::plus<Scalar> ());
793  // Convert the sum's global indices into C's local indices.
794  for (size_type k = 0; k < as<size_type> (numEntriesPerRow[localRow]); ++k) {
795  C_local_ind[k] = C_colMap->getLocalElement (C_global_ind[k]);
796  }
797  // Give the current row sum to C.
798  C->replaceLocalValues (localRow, C_local_ind, C_val);
799  }
800 
801  // Use "expert static fill complete" to bypass construction of
802  // the Import and Export (if applicable) object(s).
803  RCP<const map_type> domainMap = A_graph->getDomainMap ();
804  RCP<const map_type> rangeMap = A_graph->getRangeMap ();
805  C->expertStaticFillComplete (domainMap, rangeMap, sumImport, A_graph->getExporter ());
806 
807  return; // Now we're done!
808  }
809  else {
810  // FIXME (mfh 08 May 2013) When I first looked at this method, I
811  // noticed that C was being given the row Map of Aprime (the
812  // possibly transposed version of A). Is this what we want?
813  C = rcp (new crs_matrix_type (Aprime->getRowMap (), null));
814  }
815 
816 #else
817  // FIXME (mfh 08 May 2013) When I first looked at this method, I
818  // noticed that C was being given the row Map of Aprime (the
819  // possibly transposed version of A). Is this what we want?
820  C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
821 #endif // 0
822  }
823 
824 #ifdef HAVE_TPETRA_DEBUG
825  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
826  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
827  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
828  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
829  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
830  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
831 #endif // HAVE_TPETRA_DEBUG
832 
833  Array<RCP<const crs_matrix_type> > Mat =
834  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
835  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
836 
837  // do a loop over each matrix to add: A reordering might be more efficient
838  for (int k = 0; k < 2; ++k) {
839  Array<GlobalOrdinal> Indices;
840  Array<Scalar> Values;
841 
842  // Loop over each locally owned row of the current matrix (either
843  // Aprime or Bprime), and sum its entries into the corresponding
844  // row of C. This works regardless of whether Aprime or Bprime
845  // has the same row Map as C, because both sumIntoGlobalValues and
846  // insertGlobalValues allow summing resp. inserting into nonowned
847  // rows of C.
848 #ifdef HAVE_TPETRA_DEBUG
849  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
850  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
851 #endif // HAVE_TPETRA_DEBUG
852  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
853 #ifdef HAVE_TPETRA_DEBUG
854  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
855  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
856 #endif // HAVE_TPETRA_DEBUG
857 
858  const size_t localNumRows = Mat[k]->getNodeNumRows ();
859  for (size_t i = 0; i < localNumRows; ++i) {
860  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
861  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
862  if (numEntries > 0) {
863  Indices.resize (numEntries);
864  Values.resize (numEntries);
865  Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
866 
867  if (scalar[k] != STS::one ()) {
868  for (size_t j = 0; j < numEntries; ++j) {
869  Values[j] *= scalar[k];
870  }
871  }
872 
873  if (C->isFillComplete ()) {
874  C->sumIntoGlobalValues (globalRow, Indices, Values);
875  } else {
876  C->insertGlobalValues (globalRow, Indices, Values);
877  }
878  }
879  }
880  }
881 }
882 
883 
884 } //End namespace MatrixMatrix
885 
886 namespace MMdetails{
887 
888 // Prints MMM-style statistics on communication done with an Import or Export object
889 template <class TransferType>
890 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
891  if (Transfer.is_null())
892  return;
893 
894  const Distributor & Distor = Transfer->getDistributor();
895  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
896 
897  size_t rows_send = Transfer->getNumExportIDs();
898  size_t rows_recv = Transfer->getNumRemoteIDs();
899 
900  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
901  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
902  size_t num_send_neighbors = Distor.getNumSends();
903  size_t num_recv_neighbors = Distor.getNumReceives();
904  size_t round2_send, round2_recv;
905  Distor.getLastDoStatistics(round2_send,round2_recv);
906 
907  int myPID = Comm->getRank();
908  int NumProcs = Comm->getSize();
909 
910  // Processor by processor statistics
911  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
912  // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
913 
914  // Global statistics
915  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
916  size_t gstats_min[8], gstats_max[8];
917 
918  double lstats_avg[8], gstats_avg[8];
919  for(int i=0; i<8; i++)
920  lstats_avg[i] = ((double)lstats[i])/NumProcs;
921 
922  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
923  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
924  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
925 
926  if(!myPID) {
927  printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
928  (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
929  (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
930  printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
931  (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
932  (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
933  }
934 }
935 
936 
937 // Kernel method for computing the local portion of C = A*B
938 template<class Scalar,
939  class LocalOrdinal,
940  class GlobalOrdinal,
941  class Node>
942 void mult_AT_B_newmatrix(
943  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
944  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
945  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
946  const std::string & label,
947  const Teuchos::RCP<Teuchos::ParameterList>& params)
948 {
949  // Using & Typedefs
950  using Teuchos::RCP;
951  using Teuchos::rcp;
952  typedef Scalar SC;
953  typedef LocalOrdinal LO;
954  typedef GlobalOrdinal GO;
955  typedef Node NO;
956  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
957  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
958 
959 #ifdef HAVE_TPETRA_MMM_TIMINGS
960  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
961  using Teuchos::TimeMonitor;
962  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T Transpose"))));
963 #endif
964 
965  /*************************************************************/
966  /* 1) Local Transpose of A */
967  /*************************************************************/
968  transposer_type transposer (rcpFromRef (A));
969  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Atrans = transposer.createTransposeLocal();
970 
971  /*************************************************************/
972  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
973  /*************************************************************/
974 #ifdef HAVE_TPETRA_MMM_TIMINGS
975  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T I&X"))));
976 #endif
977 
978  // Get views, asserting that no import is required to speed up computation
979  crs_matrix_struct_type Aview;
980  crs_matrix_struct_type Bview;
981  RCP<const Import<LocalOrdinal,GlobalOrdinal, Node> > dummyImporter;
982  MMdetails::import_and_extract_views(*Atrans, Atrans->getRowMap(), Aview, dummyImporter,true, label);
983  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label);
984 
985 #ifdef HAVE_TPETRA_MMM_TIMINGS
986  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
987 #endif
988 
989  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >Ctemp;
990 
991  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
992  bool needs_final_export = !Atrans->getGraph()->getExporter().is_null();
993  if (needs_final_export)
994  Ctemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Atrans->getRowMap(),0));
995  else
996  Ctemp = rcp(&C,false);// don't allow deallocation
997 
998  // Multiply
999  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label);
1000 
1001  /*************************************************************/
1002  /* 4) exportAndFillComplete matrix */
1003  /*************************************************************/
1004 #ifdef HAVE_TPETRA_MMM_TIMINGS
1005  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1006 #endif
1007 
1008  Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Crcp(&C,false);
1009  if (needs_final_export) {
1010  Teuchos::ParameterList labelList;
1011  labelList.set("Timer Label", label);
1012  Ctemp->exportAndFillComplete(Crcp,*Ctemp->getGraph()->getExporter(),
1013  B.getDomainMap(),A.getDomainMap(),rcp(&labelList,false));
1014  }
1015 #ifdef HAVE_TPETRA_MMM_STATISTICS
1016  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1017 #endif
1018 }
1019 
1020 // Kernel method for computing the local portion of C = A*B
1021 template<class Scalar,
1022  class LocalOrdinal,
1023  class GlobalOrdinal,
1024  class Node>
1025 void mult_A_B(
1026  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1027  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1028  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1029  const std::string& label,
1030  const Teuchos::RCP<Teuchos::ParameterList>& params)
1031 {
1032  using Teuchos::Array;
1033  using Teuchos::ArrayRCP;
1034  using Teuchos::ArrayView;
1035  using Teuchos::OrdinalTraits;
1036  using Teuchos::null;
1037 
1038  typedef Teuchos::ScalarTraits<Scalar> STS;
1039  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1040  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1041  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1042 
1043  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1044  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1045 
1046  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1047  ArrayView<const GlobalOrdinal> bcols_import = null;
1048  if (Bview.importColMap != null) {
1049  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1050  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1051 
1052  bcols_import = Bview.importColMap->getNodeElementList();
1053  }
1054 
1055  size_t C_numCols = C_lastCol - C_firstCol +
1056  OrdinalTraits<LocalOrdinal>::one();
1057  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1058  OrdinalTraits<LocalOrdinal>::one();
1059 
1060  if (C_numCols_import > C_numCols)
1061  C_numCols = C_numCols_import;
1062 
1063  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1064  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1065  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1066 
1067  Array<Scalar> C_row_i = dwork;
1068  Array<GlobalOrdinal> C_cols = iwork;
1069  Array<size_t> c_index = iwork2;
1070  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1071  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1072 
1073  size_t C_row_i_length, j, k, last_index;
1074 
1075  // Run through all the hash table lookups once and for all
1076  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1077  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1078  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1079  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1080  // Maps are the same: Use local IDs as the hash
1081  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1082  Aview.colMap->getMaxLocalIndex(); i++)
1083  Acol2Brow[i]=i;
1084  }
1085  else {
1086  // Maps are not the same: Use the map's hash
1087  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1088  Aview.colMap->getMaxLocalIndex(); i++) {
1089  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1090  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1091  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1092  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1093  }
1094  }
1095 
1096  // To form C = A*B we're going to execute this expression:
1097  //
1098  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1099  //
1100  // Our goal, of course, is to navigate the data in A and B once, without
1101  // performing searches for column-indices, etc.
1102  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1103  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1104  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1105  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1106  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1107  ArrayView<const Scalar> Avals, Bvals, Ivals;
1108  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1109  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1110  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1111  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1112  if(!Bview.importMatrix.is_null()) {
1113  Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1114  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1115  }
1116 
1117  bool C_filled = C.isFillComplete();
1118 
1119  for (size_t i = 0; i < C_numCols; i++)
1120  c_index[i] = OrdinalTraits<size_t>::invalid();
1121 
1122  // Loop over the rows of A.
1123  size_t Arows = Aview.rowMap->getNodeNumElements();
1124  for(size_t i=0; i<Arows; ++i) {
1125 
1126  // Only navigate the local portion of Aview... which is, thankfully, all of
1127  // A since this routine doesn't do transpose modes
1128  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1129 
1130  // Loop across the i-th row of A and for each corresponding row in B, loop
1131  // across colums and accumulate product A(i,k)*B(k,j) into our partial sum
1132  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1133  // calculating updates for row i of the result matrix C.
1134  C_row_i_length = OrdinalTraits<size_t>::zero();
1135 
1136  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1137  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1138  Scalar Aval = Avals[k];
1139  if (Aval == STS::zero())
1140  continue;
1141 
1142  if (Ak == LO_INVALID)
1143  continue;
1144 
1145  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1146  LocalOrdinal col = Bcolind[j];
1147  //assert(col >= 0 && col < C_numCols);
1148 
1149  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1150  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1151  // This has to be a += so insertGlobalValue goes out
1152  C_row_i[C_row_i_length] = Aval*Bvals[j];
1153  C_cols[C_row_i_length] = col;
1154  c_index[col] = C_row_i_length;
1155  C_row_i_length++;
1156 
1157  } else {
1158  C_row_i[c_index[col]] += Aval*Bvals[j];
1159  }
1160  }
1161  }
1162 
1163  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1164  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1165  C_cols[ii] = bcols[C_cols[ii]];
1166  combined_index[ii] = C_cols[ii];
1167  combined_values[ii] = C_row_i[ii];
1168  }
1169  last_index = C_row_i_length;
1170 
1171  //
1172  //Now put the C_row_i values into C.
1173  //
1174  // We might have to revamp this later.
1175  C_row_i_length = OrdinalTraits<size_t>::zero();
1176 
1177  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1178  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1179  Scalar Aval = Avals[k];
1180  if (Aval == STS::zero())
1181  continue;
1182 
1183  if (Ak!=LO_INVALID) continue;
1184 
1185  Ak = Acol2Irow[Acolind[k]];
1186  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1187  LocalOrdinal col = Icolind[j];
1188  //assert(col >= 0 && col < C_numCols);
1189 
1190  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1191  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1192  // This has to be a += so insertGlobalValue goes out
1193  C_row_i[C_row_i_length] = Aval*Ivals[j];
1194  C_cols[C_row_i_length] = col;
1195  c_index[col] = C_row_i_length;
1196  C_row_i_length++;
1197 
1198  } else {
1199  // This has to be a += so insertGlobalValue goes out
1200  C_row_i[c_index[col]] += Aval*Ivals[j];
1201  }
1202  }
1203  }
1204 
1205  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1206  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1207  C_cols[ii] = bcols_import[C_cols[ii]];
1208  combined_index[last_index] = C_cols[ii];
1209  combined_values[last_index] = C_row_i[ii];
1210  last_index++;
1211  }
1212 
1213  // Now put the C_row_i values into C.
1214  // We might have to revamp this later.
1215  C_filled ?
1216  C.sumIntoGlobalValues(
1217  global_row,
1218  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1219  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1220  :
1221  C.insertGlobalValues(
1222  global_row,
1223  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1224  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1225 
1226  }
1227 }
1228 
1229 template<class Scalar,
1230  class LocalOrdinal,
1231  class GlobalOrdinal,
1232  class Node>
1233 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1234  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1235  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1236 
1237  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1238  Mview.maxNumRowEntries = Mview.indices[0].size();
1239 
1240  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1241  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1242  Mview.maxNumRowEntries = Mview.indices[i].size();
1243  }
1244 }
1245 
1246 
1247 template<class CrsMatrixType>
1248 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1249  // Follows the NZ estimate in ML's ml_matmatmult.c
1250  size_t Aest = 100, Best=100;
1251  if (A.getNodeNumEntries() > 0)
1252  Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumEntries() : 100;
1253  if (B.getNodeNumEntries() > 0)
1254  Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumEntries() : 100;
1255 
1256  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1257  nnzperrow *= nnzperrow;
1258 
1259  return (size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1260 }
1261 
1262 
1263 // Kernel method for computing the local portion of C = A*B
1264 //
1265 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1266 // function, so this is probably the function we want to
1267 // thread-parallelize.
1268 template<class Scalar,
1269  class LocalOrdinal,
1270  class GlobalOrdinal,
1271  class Node>
1272 void mult_A_B_newmatrix(
1273  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1274  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1275  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1276  const std::string& label,
1277  const Teuchos::RCP<Teuchos::ParameterList>& params)
1278 {
1279  using Teuchos::Array;
1280  using Teuchos::ArrayRCP;
1281  using Teuchos::ArrayView;
1282  using Teuchos::RCP;
1283  using Teuchos::rcp;
1284 
1285  typedef Scalar SC;
1286  typedef LocalOrdinal LO;
1287  typedef GlobalOrdinal GO;
1288  typedef Node NO;
1289 
1290  typedef Import<LO,GO,NO> import_type;
1291  typedef Map<LO,GO,NO> map_type;
1292 
1293 #ifdef HAVE_TPETRA_MMM_TIMINGS
1294  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1295  using Teuchos::TimeMonitor;
1296  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1297 #endif
1298  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1299  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1300 
1301  // Build the final importer / column map, hash table lookups for C
1302  RCP<const import_type> Cimport;
1303  RCP<const map_type> Ccolmap;
1304  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1305  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1306  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1307 
1308  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1309  // indices of B, to local column indices of C. (B and C have the
1310  // same number of columns.) The kernel uses this, instead of
1311  // copying the entire input matrix B and converting its column
1312  // indices to those of C.
1313  Array<LO> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
1314 
1315  if (Bview.importMatrix.is_null()) {
1316  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1317  Cimport = Bimport;
1318  Ccolmap = Bview.colMap;
1319  // Bcol2Ccol is trivial
1320  for (size_t i = 0; i < Bview.colMap->getNodeNumElements(); i++)
1321  Bcol2Ccol[i] = Teuchos::as<LO>(i);
1322 
1323  } else {
1324  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1325  // column Map of C, as well as C's Import object (from its domain
1326  // Map to its column Map). C's column Map is the union of the
1327  // column Maps of (the local part of) B, and the "remote" part of
1328  // B. Ditto for the Import. We have optimized this "setUnion"
1329  // operation on Import objects and Maps.
1330 
1331  // Choose the right variant of setUnion
1332  if (!Bimport.is_null() && !Iimport.is_null())
1333  Cimport = Bimport->setUnion(*Iimport);
1334 
1335  else if (!Bimport.is_null() && Iimport.is_null())
1336  Cimport = Bimport->setUnion();
1337 
1338  else if (Bimport.is_null() && !Iimport.is_null())
1339  Cimport = Iimport->setUnion();
1340 
1341  else
1342  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1343 
1344  Ccolmap = Cimport->getTargetMap();
1345 
1346  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1347  // in general. We should get rid of it in order to reduce
1348  // communication costs of sparse matrix-matrix multiply.
1349  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1350  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1351 
1352  // NOTE: This is not efficient and should be folded into setUnion
1353  //
1354  // mfh 27 Sep 2016: What the above comment means, is that the
1355  // setUnion operation on Import objects could also compute these
1356  // local index - to - local index look-up tables.
1357  Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
1358  ArrayView<const GO> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
1359  ArrayView<const GO> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
1360 
1361  for (size_t i = 0; i < Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
1362  Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
1363  for (size_t i = 0; i < Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
1364  Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
1365  }
1366 
1367 #ifdef HAVE_TPETRA_MMM_TIMINGS
1368  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
1369 #endif
1370 
1371  // Sizes
1372  size_t m = Aview.origMatrix->getNodeNumRows();
1373  size_t n = Ccolmap->getNodeNumElements();
1374 
1375  // Get Data Pointers
1376  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1377  ArrayRCP<size_t> Crowptr_RCP;
1378  ArrayRCP<const LO> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1379  ArrayRCP<LO> Ccolind_RCP;
1380  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1381  ArrayRCP<SC> Cvals_RCP;
1382 
1383  // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
1384  // out of the CrsMatrix. This code computes A * (B_local +
1385  // B_remote), where B_local contains the locally owned rows of B,
1386  // and B_remote the (previously Import'ed) remote rows of B.
1387 
1388  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1389  Bview.origMatrix->getAllValues(Browptr_RCP, Bcolind_RCP, Bvals_RCP);
1390  if (!Bview.importMatrix.is_null())
1391  Bview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1392 
1393  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1394  // where Teuchos::ArrayRCP::operator[] may be slower than
1395  // Teuchos::ArrayView::operator[].
1396 
1397  // For efficiency
1398  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1399  ArrayView<const LO> Acolind, Bcolind, Icolind;
1400  ArrayView<const SC> Avals, Bvals, Ivals;
1401  ArrayView<size_t> Crowptr;
1402  ArrayView<LO> Ccolind;
1403  ArrayView<SC> Cvals;
1404  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1405  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1406  if (!Bview.importMatrix.is_null()) {
1407  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1408  }
1409 
1410  // Classic csr assembly (low memory edition)
1411  //
1412  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1413  // The method loops over rows of A, and may resize after processing
1414  // each row. Chris Siefert says that this reflects experience in
1415  // ML; for the non-threaded case, ML found it faster to spend less
1416  // effort on estimation and risk an occasional reallocation.
1417  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1418  Crowptr_RCP.resize(m+1); Crowptr = Crowptr_RCP();
1419  Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
1420  Cvals_RCP.resize(CSR_alloc); Cvals = Cvals_RCP();
1421 
1422  // mfh 27 Sep 2016: Construct tables that map from local column
1423  // indices of A, to local row indices of either B_local (the locally
1424  // owned part of B), or B_remote (the "imported" remote part of B).
1425  //
1426  // For column index Aik in row i of A, if the corresponding row of B
1427  // exists in the local part of B ("orig") (which I'll call B_local),
1428  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1429  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1430  //
1431  // For column index Aik in row i of A, if the corresponding row of B
1432  // exists in the remote part of B ("Import") (which I'll call
1433  // B_remote), then targetMapToImportRow[Aik] is the local index of
1434  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1435  // (a flag value).
1436 
1437  // Run through all the hash table lookups once and for all
1438  Array<LO> targetMapToOrigRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
1439  Array<LO> targetMapToImportRow(Aview.colMap->getNodeNumElements(), LO_INVALID);
1440 
1441  for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
1442  LO B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1443  if (B_LID != LO_INVALID) {
1444  targetMapToOrigRow[i] = B_LID;
1445  } else {
1446  LO I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1447  targetMapToImportRow[i] = I_LID;
1448  }
1449  }
1450 
1451  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1452 
1453  // mfh 27 Sep 2016: The c_status array is an implementation detail
1454  // of the local sparse matrix-matrix multiply routine.
1455 
1456  // The status array will contain the index into colind where this entry was last deposited.
1457  // c_status[i] < CSR_ip - not in the row yet
1458  // c_status[i] >= CSR_ip - this is the entry where you can find the data
1459  // We start with this filled with INVALID's indicating that there are no entries yet.
1460  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1461  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1462  Array<size_t> c_status(n, ST_INVALID);
1463 
1464  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1465  // routine. The routine computes C := A * (B_local + B_remote).
1466  //
1467  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
1468  // you whether the corresponding row of B belongs to B_local
1469  // ("orig") or B_remote ("Import").
1470 
1471  // For each row of A/C
1472  size_t CSR_ip = 0, OLD_ip = 0;
1473  for (size_t i = 0; i < m; i++) {
1474  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
1475  // on the calling process.
1476  Crowptr[i] = CSR_ip;
1477 
1478  // mfh 27 Sep 2016: For each entry of A in the current row of A
1479  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1480  LO Aik = Acolind[k]; // local column index of current entry of A
1481  SC Aval = Avals[k]; // value of current entry of A
1482  if (Aval == SC_ZERO)
1483  continue; // skip explicitly stored zero values in A
1484 
1485  if (targetMapToOrigRow[Aik] != LO_INVALID) {
1486  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1487  // corresponding to the current entry of A is populated, then
1488  // the corresponding row of B is in B_local (i.e., it lives on
1489  // the calling process).
1490 
1491  // Local matrix
1492  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
1493 
1494  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
1495  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1496  LO Bkj = Bcolind[j];
1497  LO Cij = Bcol2Ccol[Bkj];
1498 
1499  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1500  // New entry
1501  c_status[Cij] = CSR_ip;
1502  Ccolind[CSR_ip] = Cij;
1503  Cvals[CSR_ip] = Aval*Bvals[j];
1504  CSR_ip++;
1505 
1506  } else {
1507  Cvals[c_status[Cij]] += Aval*Bvals[j];
1508  }
1509  }
1510 
1511  } else {
1512  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1513  // corresponding to the current entry of A NOT populated (has
1514  // a flag "invalid" value), then the corresponding row of B is
1515  // in B_local (i.e., it lives on the calling process).
1516 
1517  // Remote matrix
1518  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
1519  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1520  LO Ikj = Icolind[j];
1521  LO Cij = Icol2Ccol[Ikj];
1522 
1523  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1524  // New entry
1525  c_status[Cij] = CSR_ip;
1526  Ccolind[CSR_ip] = Cij;
1527  Cvals[CSR_ip] = Aval*Ivals[j];
1528  CSR_ip++;
1529 
1530  } else {
1531  Cvals[c_status[Cij]] += Aval*Ivals[j];
1532  }
1533  }
1534  }
1535  }
1536 
1537  // Resize for next pass if needed
1538  if (CSR_ip + n > CSR_alloc) {
1539  CSR_alloc *= 2;
1540  Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
1541  Cvals_RCP.resize(CSR_alloc); Cvals = Cvals_RCP();
1542  }
1543  OLD_ip = CSR_ip;
1544  }
1545 
1546  Crowptr[m] = CSR_ip;
1547 
1548  // Downward resize
1549  Cvals_RCP .resize(CSR_ip);
1550  Ccolind_RCP.resize(CSR_ip);
1551 
1552 #ifdef HAVE_TPETRA_MMM_TIMINGS
1553  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort"))));
1554 #endif
1555 
1556  // Replace the column map
1557  //
1558  // mfh 27 Sep 2016: We do this because C was originally created
1559  // without a column Map. Now we have its column Map.
1560  C.replaceColMap(Ccolmap);
1561 
1562  // Final sort & set of CRS arrays
1563  //
1564  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1565  // matrix-matrix multiply routine sort the entries for us?
1566  Import_Util::sortCrsEntries(Crowptr_RCP(), Ccolind_RCP(), Cvals_RCP());
1567  // mfh 27 Sep 2016: This just sets pointers.
1568  C.setAllValues(Crowptr_RCP, Ccolind_RCP, Cvals_RCP);
1569 
1570 #ifdef HAVE_TPETRA_MMM_TIMINGS
1571  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC"))));
1572 #endif
1573 
1574  // Final FillComplete
1575  //
1576  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1577  // Import (from domain Map to column Map) construction (which costs
1578  // lots of communication) by taking the previously constructed
1579  // Import object. We should be able to do this without interfering
1580  // with the implementation of the local part of sparse matrix-matrix
1581  // multply above.
1582  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport);
1583 }
1584 
1585 
1586 // Kernel method for computing the local portion of C = A*B
1587 template<class Scalar,
1588  class LocalOrdinal,
1589  class GlobalOrdinal,
1590  class Node>
1591 void mult_A_B_reuse(
1592  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1593  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1594  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1595  const std::string& label,
1596  const Teuchos::RCP<Teuchos::ParameterList>& params)
1597 {
1598  using Teuchos::Array;
1599  using Teuchos::ArrayRCP;
1600  using Teuchos::ArrayView;
1601  using Teuchos::RCP;
1602  using Teuchos::rcp;
1603 
1604  typedef Scalar SC;
1605  typedef LocalOrdinal LO;
1606  typedef GlobalOrdinal GO;
1607  typedef Node NO;
1608 
1609  typedef Import<LO,GO,NO> import_type;
1610  typedef Map<LO,GO,NO> map_type;
1611 
1612 #ifdef HAVE_TPETRA_MMM_TIMINGS
1613  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1614  using Teuchos::TimeMonitor;
1615  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
1616 #endif
1617  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1618  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1619 
1620  // Build the final importer / column map, hash table lookups for C
1621  RCP<const import_type> Cimport = C.getGraph()->getImporter();
1622  RCP<const map_type> Ccolmap = C.getColMap();
1623 
1624  Array<LO> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
1625  {
1626  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
1627  // So, column map of C may be a strict subset of the column map of B
1628  ArrayView<const GO> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
1629  for (size_t i = 0; i < Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
1630  Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
1631 
1632  if (!Bview.importMatrix.is_null()) {
1633  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1634  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1635 
1636  Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
1637  ArrayView<const GO> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
1638  for (size_t i = 0; i < Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
1639  Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
1640  }
1641  }
1642 
1643 #ifdef HAVE_TPETRA_MMM_TIMINGS
1644  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
1645 #endif
1646 
1647  // Sizes
1648  size_t m = Aview.origMatrix->getNodeNumRows();
1649  size_t n = Ccolmap->getNodeNumElements();
1650 
1651  // Get Data Pointers
1652  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP, Crowptr_RCP;
1653  ArrayRCP<const LO> Acolind_RCP, Bcolind_RCP, Icolind_RCP, Ccolind_RCP;
1654  ArrayRCP<const SC> Avals_RCP, Bvals_RCP, Ivals_RCP, Cvals_RCP;
1655 
1656  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1657  Bview.origMatrix->getAllValues(Browptr_RCP, Bcolind_RCP, Bvals_RCP);
1658  if (!Bview.importMatrix.is_null())
1659  Bview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1660  C.getAllValues(Crowptr_RCP, Ccolind_RCP, Cvals_RCP);
1661 
1662  // For efficiency
1663  ArrayView<const size_t> Arowptr, Browptr, Irowptr, Crowptr;
1664  ArrayView<const LO> Acolind, Bcolind, Icolind, Ccolind;
1665  ArrayView<const SC> Avals, Bvals, Ivals;
1666  ArrayView<SC> Cvals;
1667  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1668  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1669  if (!Bview.importMatrix.is_null()) {
1670  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1671  }
1672  Crowptr = Crowptr_RCP(); Ccolind = Ccolind_RCP(); Cvals = (Teuchos::arcp_const_cast<SC>(Cvals_RCP))();
1673 
1674  // Run through all the hash table lookups once and for all
1675  Array<LO> targetMapToOrigRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
1676  Array<LO> targetMapToImportRow(Aview.colMap->getNodeNumElements(), LO_INVALID);
1677 
1678  for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
1679  LO B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1680  if (B_LID != LO_INVALID) {
1681  targetMapToOrigRow[i] = B_LID;
1682  } else {
1683  LO I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1684  targetMapToImportRow[i] = I_LID;
1685  }
1686  }
1687 
1688  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1689 
1690  // The status array will contain the index into colind where this entry was last deposited.
1691  // c_status[i] < CSR_ip - not in the row yet
1692  // c_status[i] >= CSR_ip - this is the entry where you can find the data
1693  // We start with this filled with INVALID's indicating that there are no entries yet.
1694  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1695  Array<size_t> c_status(n, ST_INVALID);
1696 
1697  // For each row of A/C
1698  size_t CSR_ip = 0, OLD_ip = 0;
1699  for (size_t i = 0; i < m; i++) {
1700 
1701  // First fill the c_status array w/ locations where we're allowed to
1702  // generate nonzeros for this row
1703  OLD_ip = Crowptr[i];
1704  CSR_ip = Crowptr[i+1];
1705  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1706  c_status[Ccolind[k]] = k;
1707 
1708  // Reset values in the row of C
1709  Cvals[k] = SC_ZERO;
1710  }
1711 
1712  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1713  LO Aik = Acolind[k];
1714  SC Aval = Avals[k];
1715  if (Aval == SC_ZERO)
1716  continue;
1717 
1718  if (targetMapToOrigRow[Aik] != LO_INVALID) {
1719  // Local matrix
1720  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
1721 
1722  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1723  LO Bkj = Bcolind[j];
1724  LO Cij = Bcol2Ccol[Bkj];
1725 
1726  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1727  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1728  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1729 
1730  Cvals[c_status[Cij]] += Aval * Bvals[j];
1731  }
1732 
1733  } else {
1734  // Remote matrix
1735  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
1736  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1737  LO Ikj = Icolind[j];
1738  LO Cij = Icol2Ccol[Ikj];
1739 
1740  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1741  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1742  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1743 
1744  Cvals[c_status[Cij]] += Aval * Ivals[j];
1745  }
1746  }
1747  }
1748  }
1749 
1750  C.fillComplete(C.getDomainMap(), C.getRangeMap());
1751 }
1752 
1753 
1754 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
1755 template<class Scalar,
1756  class LocalOrdinal,
1757  class GlobalOrdinal,
1758  class Node>
1759 void jacobi_A_B_newmatrix(
1760  Scalar omega,
1761  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
1762  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1763  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1764  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1765  const std::string& label,
1766  const Teuchos::RCP<Teuchos::ParameterList>& params)
1767 {
1768  using Teuchos::Array;
1769  using Teuchos::ArrayRCP;
1770  using Teuchos::ArrayView;
1771  using Teuchos::RCP;
1772  using Teuchos::rcp;
1773 
1774  typedef Scalar SC;
1775  typedef LocalOrdinal LO;
1776  typedef GlobalOrdinal GO;
1777  typedef Node NO;
1778 
1779  typedef Import<LO,GO,NO> import_type;
1780  typedef Map<LO,GO,NO> map_type;
1781 
1782 #ifdef HAVE_TPETRA_MMM_TIMINGS
1783  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1784  using Teuchos::TimeMonitor;
1785  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
1786 #endif
1787  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1788  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1789 
1790 
1791  // Build the final importer / column map, hash table lookups for C
1792  RCP<const import_type> Cimport;
1793  RCP<const map_type> Ccolmap;
1794  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1795  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1796 
1797  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1798  // indices of B, to local column indices of C. (B and C have the
1799  // same number of columns.) The kernel uses this, instead of
1800  // copying the entire input matrix B and converting its column
1801  // indices to those of C.
1802  Array<LO> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
1803 
1804  if (Bview.importMatrix.is_null()) {
1805  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1806  Cimport = Bimport;
1807  Ccolmap = Bview.colMap;
1808  // Bcol2Ccol is trivial
1809  for (size_t i = 0; i < Bview.colMap->getNodeNumElements(); i++)
1810  Bcol2Ccol[i] = Teuchos::as<LO>(i);
1811 
1812  } else {
1813  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1814  // column Map of C, as well as C's Import object (from its domain
1815  // Map to its column Map). C's column Map is the union of the
1816  // column Maps of (the local part of) B, and the "remote" part of
1817  // B. Ditto for the Import. We have optimized this "setUnion"
1818  // operation on Import objects and Maps.
1819 
1820  // Choose the right variant of setUnion
1821  if (!Bimport.is_null() && !Iimport.is_null()){
1822  Cimport = Bimport->setUnion(*Iimport);
1823  Ccolmap = Cimport->getTargetMap();
1824 
1825  } else if (!Bimport.is_null() && Iimport.is_null()) {
1826  Cimport = Bimport->setUnion();
1827 
1828  } else if(Bimport.is_null() && !Iimport.is_null()) {
1829  Cimport = Iimport->setUnion();
1830 
1831  } else
1832  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
1833 
1834  Ccolmap = Cimport->getTargetMap();
1835 
1836  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1837  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
1838 
1839  // NOTE: This is not efficient and should be folded into setUnion
1840  //
1841  // mfh 27 Sep 2016: What the above comment means, is that the
1842  // setUnion operation on Import objects could also compute these
1843  // local index - to - local index look-up tables.
1844  Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
1845  ArrayView<const GO> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
1846  ArrayView<const GO> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
1847 
1848  for (size_t i = 0; i < Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
1849  Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
1850  for (size_t i = 0; i < Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
1851  Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
1852  }
1853 
1854 #ifdef HAVE_TPETRA_MMM_TIMINGS
1855  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix SerialCore"))));
1856 #endif
1857 
1858  // Sizes
1859  size_t m = Aview.origMatrix->getNodeNumRows();
1860  size_t n = Ccolmap->getNodeNumElements();
1861 
1862  // Get Data Pointers
1863  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1864  ArrayRCP<size_t> Crowptr_RCP;
1865  ArrayRCP<const LO> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1866  ArrayRCP<LO> Ccolind_RCP;
1867  ArrayRCP<const SC> Avals_RCP, Bvals_RCP, Ivals_RCP;
1868  ArrayRCP<SC> Cvals_RCP;
1869  ArrayRCP<const SC> Dvals_RCP;
1870 
1871  // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
1872  // out of the CrsMatrix. This code computes A * (B_local +
1873  // B_remote), where B_local contains the locally owned rows of B,
1874  // and B_remote the (previously Import'ed) remote rows of B.
1875 
1876  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1877  Bview.origMatrix->getAllValues(Browptr_RCP, Bcolind_RCP, Bvals_RCP);
1878  if (!Bview.importMatrix.is_null())
1879  Bview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1880 
1881  // mfh 27 Sep 2016: The "Jacobi" case scales by the inverse of a
1882  // diagonal matrix.
1883  Dvals_RCP = Dinv.getData();
1884 
1885  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1886  // where Teuchos::ArrayRCP::operator[] may be slower than
1887  // Teuchos::ArrayView::operator[].
1888 
1889  // For efficiency
1890  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1891  ArrayView<const LO> Acolind, Bcolind, Icolind;
1892  ArrayView<const SC> Avals, Bvals, Ivals;
1893  ArrayView<size_t> Crowptr;
1894  ArrayView<LO> Ccolind;
1895  ArrayView<SC> Cvals;
1896  ArrayView<const SC> Dvals;
1897  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1898  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1899  if (!Bview.importMatrix.is_null()) {
1900  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1901  }
1902  Dvals = Dvals_RCP();
1903 
1904  // The status array will contain the index into colind where this entry was last deposited.
1905  // c_status[i] < CSR_ip - not in the row yet.
1906  // c_status[i] >= CSR_ip, this is the entry where you can find the data
1907  // We start with this filled with INVALID's indicating that there are no entries yet.
1908  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1909  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1910  Array<size_t> c_status(n, ST_INVALID);
1911 
1912  // Classic csr assembly (low memory edition)
1913  //
1914  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1915  // The method loops over rows of A, and may resize after processing
1916  // each row. Chris Siefert says that this reflects experience in
1917  // ML; for the non-threaded case, ML found it faster to spend less
1918  // effort on estimation and risk an occasional reallocation.
1919  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1920  size_t CSR_ip = 0, OLD_ip = 0;
1921  Crowptr_RCP.resize(m+1); Crowptr = Crowptr_RCP();
1922  Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
1923  Cvals_RCP.resize(CSR_alloc); Cvals = Cvals_RCP();
1924 
1925  // mfh 27 Sep 2016: Construct tables that map from local column
1926  // indices of A, to local row indices of either B_local (the locally
1927  // owned part of B), or B_remote (the "imported" remote part of B).
1928  //
1929  // For column index Aik in row i of A, if the corresponding row of B
1930  // exists in the local part of B ("orig") (which I'll call B_local),
1931  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1932  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1933  //
1934  // For column index Aik in row i of A, if the corresponding row of B
1935  // exists in the remote part of B ("Import") (which I'll call
1936  // B_remote), then targetMapToImportRow[Aik] is the local index of
1937  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1938  // (a flag value).
1939 
1940  // Run through all the hash table lookups once and for all
1941  Array<LO> targetMapToOrigRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
1942  Array<LO> targetMapToImportRow(Aview.colMap->getNodeNumElements(), LO_INVALID);
1943 
1944  for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
1945  LO B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1946  if (B_LID != LO_INVALID) {
1947  targetMapToOrigRow[i] = B_LID;
1948  } else {
1949  LO I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1950  targetMapToImportRow[i] = I_LID;
1951  }
1952  }
1953 
1954  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1955 
1956  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1957  // routine. The routine computes
1958  //
1959  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
1960  //
1961  // This corresponds to one sweep of (weighted) Jacobi.
1962  //
1963  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
1964  // you whether the corresponding row of B belongs to B_local
1965  // ("orig") or B_remote ("Import").
1966 
1967  // For each row of A/C
1968  for (size_t i = 0; i < m; i++) {
1969  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
1970  // on the calling process.
1971  Crowptr[i] = CSR_ip;
1972  SC Dval = Dvals[i];
1973 
1974  // Entries of B
1975  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
1976  Scalar Bval = Bvals[j];
1977  if (Bval == SC_ZERO)
1978  continue;
1979  LO Bij = Bcolind[j];
1980  LO Cij = Bcol2Ccol[Bij];
1981 
1982  // Assume no repeated entries in B
1983  c_status[Cij] = CSR_ip;
1984  Ccolind[CSR_ip] = Cij;
1985  Cvals[CSR_ip] = Bvals[j];
1986  CSR_ip++;
1987  }
1988 
1989  // Entries of -omega * Dinv * A * B
1990  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1991  LO Aik = Acolind[k];
1992  SC Aval = Avals[k];
1993  if (Aval == SC_ZERO)
1994  continue;
1995 
1996  if (targetMapToOrigRow[Aik] != LO_INVALID) {
1997  // Local matrix
1998  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
1999 
2000  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2001  LO Bkj = Bcolind[j];
2002  LO Cij = Bcol2Ccol[Bkj];
2003 
2004  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2005  // New entry
2006  c_status[Cij] = CSR_ip;
2007  Ccolind[CSR_ip] = Cij;
2008  Cvals[CSR_ip] = - omega * Dval* Aval * Bvals[j];
2009  CSR_ip++;
2010 
2011  } else {
2012  Cvals[c_status[Cij]] -= omega * Dval* Aval * Bvals[j];
2013  }
2014  }
2015 
2016  } else {
2017  // Remote matrix
2018  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
2019  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2020  LO Ikj = Icolind[j];
2021  LO Cij = Icol2Ccol[Ikj];
2022 
2023  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2024  // New entry
2025  c_status[Cij] = CSR_ip;
2026  Ccolind[CSR_ip] = Cij;
2027  Cvals[CSR_ip] = - omega * Dval* Aval * Ivals[j];
2028  CSR_ip++;
2029  } else {
2030  Cvals[c_status[Cij]] -= omega * Dval* Aval * Ivals[j];
2031  }
2032  }
2033  }
2034  }
2035 
2036  // Resize for next pass if needed
2037  if (CSR_ip + n > CSR_alloc) {
2038  CSR_alloc *= 2;
2039  Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
2040  Cvals_RCP.resize(CSR_alloc); Cvals = Cvals_RCP();
2041  }
2042  OLD_ip = CSR_ip;
2043  }
2044 
2045  Crowptr[m] = CSR_ip;
2046 
2047  // Downward resize
2048  Cvals_RCP .resize(CSR_ip);
2049  Ccolind_RCP.resize(CSR_ip);
2050 
2051 
2052 #ifdef HAVE_TPETRA_MMM_TIMINGS
2053  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort"))));
2054 #endif
2055 
2056  // Replace the column map
2057  //
2058  // mfh 27 Sep 2016: We do this because C was originally created
2059  // without a column Map. Now we have its column Map.
2060  C.replaceColMap(Ccolmap);
2061 
2062  // Final sort & set of CRS arrays
2063  //
2064  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2065  // matrix-matrix multiply routine sort the entries for us?
2066  Import_Util::sortCrsEntries(Crowptr_RCP(), Ccolind_RCP(), Cvals_RCP());
2067  // mfh 27 Sep 2016: This just sets pointers.
2068  C.setAllValues(Crowptr_RCP, Ccolind_RCP, Cvals_RCP);
2069 
2070 #ifdef HAVE_TPETRA_MMM_TIMINGS
2071  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC"))));
2072 #endif
2073 
2074  // Final FillComplete
2075  //
2076  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2077  // Import (from domain Map to column Map) construction (which costs
2078  // lots of communication) by taking the previously constructed
2079  // Import object. We should be able to do this without interfering
2080  // with the implementation of the local part of sparse matrix-matrix
2081  // multply above.
2082  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport);
2083 }
2084 
2085 
2086 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2087 template<class Scalar,
2088  class LocalOrdinal,
2089  class GlobalOrdinal,
2090  class Node>
2091 void jacobi_A_B_reuse(
2092  Scalar omega,
2093  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2094  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2095  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2096  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2097  const std::string& label,
2098  const Teuchos::RCP<Teuchos::ParameterList>& params)
2099 {
2100  using Teuchos::Array;
2101  using Teuchos::ArrayRCP;
2102  using Teuchos::ArrayView;
2103  using Teuchos::RCP;
2104  using Teuchos::rcp;
2105 
2106  typedef Scalar SC;
2107  typedef LocalOrdinal LO;
2108  typedef GlobalOrdinal GO;
2109  typedef Node NO;
2110 
2111  typedef Import<LO,GO,NO> import_type;
2112  typedef Map<LO,GO,NO> map_type;
2113 
2114 #ifdef HAVE_TPETRA_MMM_TIMINGS
2115  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2116  using Teuchos::TimeMonitor;
2117  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2118 #endif
2119  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2120  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2121 
2122  // Build the final importer / column map, hash table lookups for C
2123  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2124  RCP<const map_type> Ccolmap = C.getColMap();
2125 
2126  Array<LO> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
2127 
2128  if (Bview.importMatrix.is_null()) {
2129  // Bcol2Ccol is trivial
2130  // This is possible. This situation is different from mult_A_B_reuse, as we
2131  // always add B
2132  for (size_t i = 0; i < Bview.colMap->getNodeNumElements(); i++)
2133  Bcol2Ccol[i] = Teuchos::as<LO>(i);
2134 
2135  } else {
2136  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2137  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2138 
2139  // NOTE: This is not efficient and should be folded into setUnion
2140  Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
2141  ArrayView<const GO> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
2142  ArrayView<const GO> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
2143 
2144  for (size_t i = 0; i < Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
2145  Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
2146  for (size_t i = 0; i < Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
2147  Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
2148  }
2149 
2150 #ifdef HAVE_TPETRA_MMM_TIMINGS
2151  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
2152 #endif
2153 
2154  // Sizes
2155  size_t m = Aview.origMatrix->getNodeNumRows();
2156  size_t n = Ccolmap->getNodeNumElements();
2157 
2158  // Get Data Pointers
2159  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP, Crowptr_RCP;
2160  ArrayRCP<const LO> Acolind_RCP, Bcolind_RCP, Icolind_RCP, Ccolind_RCP;
2161  ArrayRCP<const SC> Avals_RCP, Bvals_RCP, Ivals_RCP, Cvals_RCP, Dvals_RCP;
2162 
2163  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
2164  Bview.origMatrix->getAllValues(Browptr_RCP, Bcolind_RCP, Bvals_RCP);
2165  if (!Bview.importMatrix.is_null())
2166  Bview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
2167  C.getAllValues(Crowptr_RCP, Ccolind_RCP, Cvals_RCP);
2168  Dvals_RCP = Dinv.getData();
2169 
2170  // For efficiency
2171  ArrayView<const size_t> Arowptr, Browptr, Irowptr, Crowptr;
2172  ArrayView<const LO> Acolind, Bcolind, Icolind, Ccolind;
2173  ArrayView<const SC> Avals, Bvals, Ivals;
2174  ArrayView<SC> Cvals;
2175  ArrayView<const SC> Dvals;
2176  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
2177  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
2178  if (!Bview.importMatrix.is_null()) {
2179  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
2180  }
2181  Crowptr = Crowptr_RCP(); Ccolind = Ccolind_RCP(); Cvals = (Teuchos::arcp_const_cast<SC>(Cvals_RCP))();
2182  Dvals = Dvals_RCP();
2183 
2184  // Run through all the hash table lookups once and for all
2185  Array<LO> targetMapToOrigRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
2186  Array<LO> targetMapToImportRow(Aview.colMap->getNodeNumElements(), LO_INVALID);
2187 
2188  for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
2189  LO B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
2190  if (B_LID != LO_INVALID) {
2191  targetMapToOrigRow[i] = B_LID;
2192  } else {
2193  LO I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
2194  targetMapToImportRow[i] = I_LID;
2195  }
2196  }
2197 
2198  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2199 
2200  // The status array will contain the index into colind where this entry was last deposited.
2201  // c_status[i] < CSR_ip - not in the row yet
2202  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2203  // We start with this filled with INVALID's indicating that there are no entries yet.
2204  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2205  Array<size_t> c_status(n, ST_INVALID);
2206 
2207  // For each row of A/C
2208  size_t CSR_ip = 0, OLD_ip = 0;
2209  for (size_t i = 0; i < m; i++) {
2210 
2211  // First fill the c_status array w/ locations where we're allowed to
2212  // generate nonzeros for this row
2213  OLD_ip = Crowptr[i];
2214  CSR_ip = Crowptr[i+1];
2215  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2216  c_status[Ccolind[k]] = k;
2217 
2218  // Reset values in the row of C
2219  Cvals[k] = SC_ZERO;
2220  }
2221 
2222  SC Dval = Dvals[i];
2223 
2224  // Entries of B
2225  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2226  Scalar Bval = Bvals[j];
2227  if (Bval == SC_ZERO)
2228  continue;
2229  LO Bij = Bcolind[j];
2230  LO Cij = Bcol2Ccol[Bij];
2231 
2232  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2233  std::runtime_error, "Trying to insert a new entry into a static graph");
2234 
2235  Cvals[c_status[Cij]] = Bvals[j];
2236  }
2237 
2238  // Entries of -omega * Dinv * A * B
2239  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2240  LO Aik = Acolind[k];
2241  SC Aval = Avals[k];
2242  if (Aval == SC_ZERO)
2243  continue;
2244 
2245  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2246  // Local matrix
2247  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
2248 
2249  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2250  LO Bkj = Bcolind[j];
2251  LO Cij = Bcol2Ccol[Bkj];
2252 
2253  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2254  std::runtime_error, "Trying to insert a new entry into a static graph");
2255 
2256  Cvals[c_status[Cij]] -= omega * Dval* Aval * Bvals[j];
2257  }
2258 
2259  } else {
2260  // Remote matrix
2261  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
2262  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2263  LO Ikj = Icolind[j];
2264  LO Cij = Icol2Ccol[Ikj];
2265 
2266  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2267  std::runtime_error, "Trying to insert a new entry into a static graph");
2268 
2269  Cvals[c_status[Cij]] -= omega * Dval* Aval * Ivals[j];
2270  }
2271  }
2272  }
2273  }
2274 
2275  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2276 }
2277 
2278 
2279 template<class Scalar,
2280  class LocalOrdinal,
2281  class GlobalOrdinal,
2282  class Node>
2283 void import_and_extract_views(
2284  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2285  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2286  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2287  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2288  bool userAssertsThereAreNoRemotes,
2289  const std::string& label,
2290  const Teuchos::RCP<Teuchos::ParameterList>& params)
2291 {
2292  using Teuchos::Array;
2293  using Teuchos::ArrayView;
2294  using Teuchos::RCP;
2295  using Teuchos::rcp;
2296  using Teuchos::null;
2297 
2298  typedef Scalar SC;
2299  typedef LocalOrdinal LO;
2300  typedef GlobalOrdinal GO;
2301  typedef Node NO;
2302 
2303  typedef Map<LO,GO,NO> map_type;
2304  typedef Import<LO,GO,NO> import_type;
2305  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2306 
2307 #ifdef HAVE_TPETRA_MMM_TIMINGS
2308  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2309  using Teuchos::TimeMonitor;
2310  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
2311 #endif
2312 
2313  // The goal of this method is to populate the 'Aview' struct with views of the
2314  // rows of A, including all rows that correspond to elements in 'targetMap'.
2315  //
2316  // If targetMap includes local elements that correspond to remotely-owned rows
2317  // of A, then those remotely-owned rows will be imported into
2318  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
2319  Aview.deleteContents();
2320 
2321  Aview.origMatrix = rcp(&A, false);
2322  Aview.origRowMap = A.getRowMap();
2323  Aview.rowMap = targetMap;
2324  Aview.colMap = A.getColMap();
2325  Aview.domainMap = A.getDomainMap();
2326  Aview.importColMap = null;
2327 
2328  // Short circuit if the user swears there are no remotes
2329  if (userAssertsThereAreNoRemotes)
2330  return;
2331 
2332  RCP<const import_type> importer;
2333  if (params != null && params->isParameter("importer")) {
2334  importer = params->get<RCP<const import_type> >("importer");
2335 
2336  } else {
2337 #ifdef HAVE_TPETRA_MMM_TIMINGS
2338  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
2339 #endif
2340 
2341  // Mark each row in targetMap as local or remote, and go ahead and get a view
2342  // for the local rows
2343  RCP<const map_type> rowMap = A.getRowMap(), remoteRowMap;
2344  size_t numRemote = 0;
2345  int mode = 0;
2346  if (!prototypeImporter.is_null() &&
2347  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2348  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2349  // We have a valid prototype importer --- ask it for the remotes
2350  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2351  numRemote = prototypeImporter->getNumRemoteIDs();
2352 
2353  Array<GO> remoteRows(numRemote);
2354  for (size_t i = 0; i < numRemote; i++)
2355  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2356 
2357  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2358  rowMap->getIndexBase(), rowMap->getComm(), rowMap->getNode()));
2359  mode = 1;
2360 
2361  } else if (prototypeImporter.is_null()) {
2362  // No prototype importer --- count the remotes the hard way
2363  ArrayView<const GO> rows = targetMap->getNodeElementList();
2364  size_t numRows = targetMap->getNodeNumElements();
2365 
2366  Array<GO> remoteRows(numRows);
2367  for(size_t i = 0; i < numRows; ++i) {
2368  const LO mlid = rowMap->getLocalElement(rows[i]);
2369 
2370  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2371  remoteRows[numRemote++] = rows[i];
2372  }
2373  remoteRows.resize(numRemote);
2374  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2375  rowMap->getIndexBase(), rowMap->getComm(), rowMap->getNode()));
2376  mode = 2;
2377 
2378  } else {
2379  // PrototypeImporter is bad. But if we're in serial that's OK.
2380  mode = 3;
2381  }
2382 
2383  const int numProcs = rowMap->getComm()->getSize();
2384  if (numProcs < 2) {
2385  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2386  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2387  // If only one processor we don't need to import any remote rows, so return.
2388  return;
2389  }
2390 
2391  //
2392  // Now we will import the needed remote rows of A, if the global maximum
2393  // value of numRemote is greater than 0.
2394  //
2395 #ifdef HAVE_TPETRA_MMM_TIMINGS
2396  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
2397 #endif
2398 
2399  global_size_t globalMaxNumRemote = 0;
2400  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2401 
2402  if (globalMaxNumRemote > 0) {
2403 #ifdef HAVE_TPETRA_MMM_TIMINGS
2404  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
2405 #endif
2406  // Create an importer with target-map remoteRowMap and source-map rowMap.
2407  if (mode == 1)
2408  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2409  else if (mode == 2)
2410  importer = rcp(new import_type(rowMap, remoteRowMap));
2411  else
2412  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
2413  }
2414 
2415  if (params != null)
2416  params->set("importer", importer);
2417  }
2418 
2419  if (importer != null) {
2420 #ifdef HAVE_TPETRA_MMM_TIMINGS
2421  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
2422 #endif
2423 
2424  // Now create a new matrix into which we can import the remote rows of A that we need.
2425  Teuchos::ParameterList labelList;
2426  labelList.set("Timer Label", label);
2427  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
2428  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
2429 
2430 #ifdef HAVE_TPETRA_MMM_STATISTICS
2431  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
2432 #endif
2433 
2434 
2435 #ifdef HAVE_TPETRA_MMM_TIMINGS
2436  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
2437 #endif
2438 
2439  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
2440  Aview.importColMap = Aview.importMatrix->getColMap();
2441  }
2442 }
2443 
2444 
2445 
2446 } //End namepsace MMdetails
2447 
2448 } //End namespace Tpetra
2449 //
2450 // Explicit instantiation macro
2451 //
2452 // Must be expanded from within the Tpetra namespace!
2453 //
2454 
2455 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
2456  \
2457  template \
2458  void MatrixMatrix::Multiply( \
2459  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2460  bool transposeA, \
2461  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
2462  bool transposeB, \
2463  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
2464  bool call_FillComplete_on_result, \
2465  const std::string & label, \
2466  const Teuchos::RCP<Teuchos::ParameterList>& params); \
2467 \
2468 template \
2469  void MatrixMatrix::Jacobi( \
2470  SCALAR omega, \
2471  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
2472  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2473  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
2474  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
2475  bool call_FillComplete_on_result, \
2476  const std::string & label, \
2477  const Teuchos::RCP<Teuchos::ParameterList>& params); \
2478 \
2479  template \
2480  void MatrixMatrix::Add( \
2481  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2482  bool transposeA, \
2483  SCALAR scalarA, \
2484  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
2485  bool transposeB, \
2486  SCALAR scalarB, \
2487  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
2488  \
2489  template \
2490  void MatrixMatrix::Add( \
2491  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
2492  bool transposeA, \
2493  SCALAR scalarA, \
2494  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
2495  SCALAR scalarB ); \
2496  \
2497  template \
2498  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
2499  MatrixMatrix::add (const SCALAR & alpha, \
2500  const bool transposeA, \
2501  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2502  const SCALAR & beta, \
2503  const bool transposeB, \
2504  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
2505  const Teuchos::RCP<const Map< LO , GO , NODE > >& domainMap, \
2506  const Teuchos::RCP<const Map< LO , GO , NODE > >& rangeMap, \
2507  const Teuchos::RCP<Teuchos::ParameterList>& params); \
2508 \
2509 
2510 
2511 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
ProfileType getProfileType() const
Returns true if the matrix was allocated with static data structures.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t getNodeMaxNumRowEntries() const
Returns the maximum number of entries across all rows/columns on this node.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const
This matrix&#39;s graph, as a RowGraph.
bool isFillActive() const
Whether the matrix is not fill complete.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > getRowMap() const
The Map that describes the row distribution in this matrix.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
size_t global_size_t
Global size_t object.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
global_size_t getGlobalNumRows() const
Number of global elements in the row map of this matrix.
bool isFillComplete() const
Whether the matrix is fill complete.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
void keyValueMerge(KeyInputIterType keyBeg1, KeyInputIterType keyEnd1, ValueInputIterType valBeg1, ValueInputIterType valEnd1, KeyInputIterType keyBeg2, KeyInputIterType keyEnd2, ValueInputIterType valBeg2, ValueInputIterType valEnd2, KeyOutputIterType keyOut, ValueOutputIterType valOut, BinaryFunction f)
Merge two sorted (by keys) sequences of unique (key,value) pairs by combining pairs with equal keys...
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
Utility functions for packing and unpacking sparse matrix entries.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
size_t getNodeNumRows() const
The number of matrix rows owned by the calling process.
Describes a parallel distribution of objects over processes.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
A read-only, row-oriented interface to a sparse matrix.
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const map_type > getDomainMap() const
The domain Map of this matrix.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Teuchos::RCP< const map_type > getRangeMap() const
The range Map of this matrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix is distributed.
bool isLocallyIndexed() const
Whether the matrix is locally indexed on the calling process.