Zoltan2_TaskMapping.hpp
#ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_

#include <fstream>
#include <ctime>
#include <vector>

#include "Teuchos_ArrayViewDecl.hpp"
#include "Teuchos_ReductionOp.hpp"

#include "Zoltan2_GraphModel.hpp"
#include <zoltan_dd.h>
#include <Zoltan2_TPLTraits.hpp>

#include "Teuchos_Comm.hpp"
#ifdef HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultMpiComm.hpp"
#endif // HAVE_ZOLTAN2_MPI
#include <Teuchos_DefaultSerialComm.hpp>

//#define gnuPlot
namespace Teuchos{

/*! \brief Zoltan2_ReduceBestMapping: reduction operator over
 *  {cost, rank} pairs that keeps the pair with the smaller cost,
 *  breaking near-ties in favor of the smaller rank.
 */
template <typename Ordinal, typename T>
class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
{
private:
  T _EPSILON;

public:
  /*! \brief Default Constructor. */
  Zoltan2_ReduceBestMapping(): _EPSILON(std::numeric_limits<T>::epsilon()){}

  /*! \brief Implement Teuchos::ValueTypeReductionOp interface. */
  void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
  {
    for (Ordinal i = 0; i < count; i++){
      if (inBuffer[0] - inoutBuffer[0] < -_EPSILON){
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      } else if (inBuffer[0] - inoutBuffer[0] < _EPSILON &&
                 inBuffer[1] - inoutBuffer[1] < _EPSILON){
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
    }
  }
};
} // namespace Teuchos
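// Usage sketch for Zoltan2_ReduceBestMapping (this mirrors the reduceAll
// call made later in getBestMapping()): every rank contributes a
// {cost, rank} pair, and after the reduction globalCost[0] holds the
// smallest cost while globalCost[1] holds the lowest rank achieving it.
//
//   Teuchos::Zoltan2_ReduceBestMapping<int, double> reduceBest;
//   double localCost[2] = {myCost, double(comm->getRank())};
//   double globalCost[2] = {std::numeric_limits<double>::max(),
//                           std::numeric_limits<double>::max()};
//   Teuchos::reduceAll<int, double>(*comm, reduceBest, 2, localCost, globalCost);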

namespace Zoltan2{

//! Computes x! recursively; used to enumerate the permutations of the
//! coordinate dimensions. Assumes x >= 1.
template <typename it>
inline it z2Fact(it x) {
  return (x == 1 ? x : x * z2Fact<it>(x - 1));
}

template <typename gno_t, typename part_t>
class GNO_LNO_PAIR{
public:
  gno_t gno;
  part_t part;
};

//returns the ith permutation indices.
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm)
{
  IT j, k = 0;
  IT *fact = new IT[n];

  // compute factorial numbers
  fact[k] = 1;
  while (++k < n)
    fact[k] = fact[k - 1] * k;

  // compute factorial code
  for (k = 0; k < n; ++k)
  {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }

  // readjust values to obtain the permutation
  // start from the end and check if preceding values are lower
  for (k = n - 1; k > 0; --k)
    for (j = k - 1; j >= 0; --j)
      if (perm[j] <= perm[k])
        perm[k]++;

  delete [] fact;
}
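// Worked example: ithPermutation(3, 4, perm) first produces the factorial
// code {2, 0, 0} (since 4 = 2*2! + 0*1! + 0*0!); the readjustment pass then
// yields perm = {2, 0, 1}, the permutation of {0,1,2} with 0-based
// lexicographic index 4 (012, 021, 102, 120, -> 201 <-, 210).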

//Builds the CSR communication graph of a dim-dimensional grid of tasks:
//each task is connected to its (at most 2 * dim) face neighbors.
template <typename part_t>
void getGridCommunicationGraph(part_t taskCount, part_t *&task_comm_xadj, part_t *&task_comm_adj, std::vector<int> grid_dims){
  int dim = grid_dims.size();
  int neighborCount = 2 * dim;
  task_comm_xadj = allocMemory<part_t>(taskCount+1);
  task_comm_adj = allocMemory<part_t>(taskCount * neighborCount);

  part_t neighBorIndex = 0;
  task_comm_xadj[0] = 0;
  for (part_t i = 0; i < taskCount; ++i){
    part_t prevDimMul = 1;
    for (int j = 0; j < dim; ++j){
      part_t lNeighbor = i - prevDimMul;
      part_t rNeighbor = i + prevDimMul;
      prevDimMul *= grid_dims[j];
      if (lNeighbor >= 0 && lNeighbor / prevDimMul == i / prevDimMul && lNeighbor < taskCount){
        task_comm_adj[neighBorIndex++] = lNeighbor;
      }
      if (rNeighbor >= 0 && rNeighbor / prevDimMul == i / prevDimMul && rNeighbor < taskCount){
        task_comm_adj[neighBorIndex++] = rNeighbor;
      }
    }
    task_comm_xadj[i+1] = neighBorIndex;
  }
}
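// Example: with grid_dims = {2, 3} (6 tasks, first dimension varying
// fastest), task 0 gets neighbors {1, 2}: task 1 along the first dimension
// and task 2 along the second. The CSR arrays then begin as
// task_comm_xadj = {0, 2, ...}, task_comm_adj = {1, 2, ...}.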

//returns the center coordinates of each part.
template <typename Adapter, typename scalar_t, typename part_t>
void getSolutionCenterCoordinates(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
    //const Zoltan2::PartitioningSolution<Adapter> *soln_,
    const part_t *parts,
    int coordDim,
    part_t ntasks,
    scalar_t **partCenters){

  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::gno_t gno_t;

  typedef StridedData<lno_t, scalar_t> input_t;
  ArrayView<const gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;
  coords->getCoordinates(gnos, xyz, wgts);

  //local and global number of coordinates.
  lno_t numLocalCoords = coords->getLocalNumCoordinates();
  //gno_t numGlobalCoords = coords->getGlobalNumCoordinates();

  //local number of points in each part.
  gno_t *point_counts = allocMemory<gno_t>(ntasks);
  memset(point_counts, 0, sizeof(gno_t) * ntasks);

  //global number of points in each part.
  gno_t *global_point_counts = allocMemory<gno_t>(ntasks);

  scalar_t **multiJagged_coordinates = allocMemory<scalar_t *>(coordDim);

  for (int dim = 0; dim < coordDim; dim++){
    ArrayRCP<const scalar_t> ar;
    xyz[dim].getInputArray(ar);
    //assign the coordinate values for this dimension.
    multiJagged_coordinates[dim] = (scalar_t *)ar.getRawPtr();
    memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
  }

  //get parts with parallel gnos.
  //const part_t *parts = soln_->getPartListView();

  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");

  for (lno_t i = 0; i < numLocalCoords; i++){
    gno_t g = gnos[i];
    part_t p = parts[i];
    //add up all coordinates in each part.
    for (int j = 0; j < coordDim; ++j){
      scalar_t c = multiJagged_coordinates[j][i];
      partCenters[j][p] += c;
    }
    ++point_counts[p];
  }

  //get the global number of points in each part.
  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
      ntasks, point_counts, global_point_counts);

  //each rank divides its local coordinate sums by the global counts;
  //the REDUCE_SUM below then yields the true global part centers.
  for (int j = 0; j < coordDim; ++j){
    for (part_t i = 0; i < ntasks; ++i){
      partCenters[j][i] /= global_point_counts[i];
    }
  }

  scalar_t *tmpCoords = allocMemory<scalar_t>(ntasks);
  for (int j = 0; j < coordDim; ++j){
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
        ntasks, partCenters[j], tmpCoords);

    //swap the buffers so partCenters[j] points at the reduced values;
    //the old buffer is reused as the temporary for the next dimension.
    scalar_t *tmp = partCenters[j];
    partCenters[j] = tmpCoords;
    tmpCoords = tmp;
  }
  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");

  freeArray<gno_t>(point_counts);
  freeArray<gno_t>(global_point_counts);

  freeArray<scalar_t>(tmpCoords);
  freeArray<scalar_t *>(multiJagged_coordinates);
}

//returns the coarsened part graph: the graph whose vertices are the np
//parts and whose edge weights accumulate the weights of the edges
//crossing between two parts.
template <typename Adapter, typename scalar_t, typename part_t>
void getCoarsenedPartGraph(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
    //const Zoltan2::PartitioningSolution<Adapter> *soln_,
    part_t np,
    const part_t *parts,
    ArrayRCP<part_t> &g_part_xadj,
    ArrayRCP<part_t> &g_part_adj,
    ArrayRCP<scalar_t> &g_part_ew
    ){

  typedef typename Adapter::lno_t t_lno_t;
  typedef typename Adapter::gno_t t_gno_t;
  typedef typename Adapter::scalar_t t_scalar_t;
  typedef StridedData<t_lno_t, t_scalar_t> t_input_t;

  //get parts with parallel gnos.
  /*
  const part_t *parts = soln_->getPartListView();

  part_t np = soln_->getActualGlobalNumberOfParts();
  if (part_t (soln_->getTargetGlobalNumberOfParts()) > np){
    np = soln_->getTargetGlobalNumberOfParts();
  }
  */

  t_lno_t localNumVertices = graph->getLocalNumVertices();
  t_gno_t globalNumVertices = graph->getGlobalNumVertices();
  t_lno_t localNumEdges = graph->getLocalNumEdges();

  //get the vertex global ids and weights.
  ArrayView<const t_gno_t> Ids;
  ArrayView<t_input_t> v_wghts;
  graph->getVertexList(Ids, v_wghts);

  //get the edge ids and weights.
  ArrayView<const t_gno_t> edgeIds;
  ArrayView<const t_lno_t> offsets;
  ArrayView<t_input_t> e_wgts;
  graph->getEdgeList(edgeIds, offsets, e_wgts);

  std::vector<t_scalar_t> edge_weights;
  int numWeightPerEdge = graph->getNumWeightsPerEdge();

  if (numWeightPerEdge > 0){
    edge_weights = std::vector<t_scalar_t>(localNumEdges);
    for (t_lno_t i = 0; i < localNumEdges; ++i){
      edge_weights[i] = e_wgts[0][i];
    }
  }

  //create a zoltan dictionary to get the parts of the vertices
  //at the other end of the edges.
  std::vector<part_t> e_parts(localNumEdges);
#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1)
  {
    Zoltan_DD_Struct *dd = NULL;

    MPI_Comm mpicomm = Teuchos::getRawMpiComm(*comm);
    int size_gnot = TPL_Traits<ZOLTAN_ID_PTR, t_gno_t>::NUM_ID;

    int debug_level = 0;
    Zoltan_DD_Create(&dd, mpicomm,
        size_gnot, 0,
        sizeof(part_t), localNumVertices, debug_level);

    ZOLTAN_ID_PTR ddnotneeded = NULL; // Local IDs not needed
    Zoltan_DD_Update(
        dd,
        (ZOLTAN_ID_PTR) Ids.getRawPtr(),
        ddnotneeded,
        (char *) parts,
        NULL,
        int(localNumVertices));

    Zoltan_DD_Find(
        dd,
        (ZOLTAN_ID_PTR) edgeIds.getRawPtr(),
        ddnotneeded,
        (char *)&(e_parts[0]),
        NULL,
        localNumEdges,
        NULL
        );
    Zoltan_DD_Destroy(&dd);
  } else
#endif
  {
    //in the serial case the global ids equal the local ids, so the part
    //of each edge's far endpoint can be read directly.
    for (t_lno_t i = 0; i < localNumEdges; ++i){
      t_gno_t ei = edgeIds[i];
      part_t p = parts[ei];
      e_parts[i] = p;
    }

    //get the vertices in each part in my part.
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    //cluster the vertices according to their parts,
    //keeping a linked list per part.
    for (t_lno_t i = 0; i < localNumVertices; ++i){
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(localNumEdges);
    g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
    part_t nindex = 0;
    g_part_xadj[0] = 0;
    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    //coarsen: visit the vertices of each part, part by part.
    for (t_lno_t i = 0; i < np; ++i){
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      //get part i, and the first vertex v in this part.
      while (v != -1){
        //now get the neighbors of v.
        for (t_lno_t j = offsets[v]; j < offsets[v+1]; ++j){
          //get the part of the second vertex.
          part_t ep = e_parts[j];

          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0){
            ew = edge_weights[j];
          }
          //a ~zero accumulated weight means part ep has not yet been
          //recorded as a neighbor of part i.
          if (part_neighbor_weights[ep] < 0.00001){
            part_neighbors[num_neighbor_parts++] = ep;
          }
          part_neighbor_weights[ep] += ew;
        }
        v = part_nexts[v];
      }

      //now write out the neighbor-part list of part i.
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
        part_t neighbor_part = part_neighbors[j];
        g_part_adj[nindex] = neighbor_part;
        g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }
      g_part_xadj[i + 1] = nindex;
    }
    return;
  }

  RCP<const Teuchos::Comm<int> > tcomm = rcpFromRef(*comm);
  typedef Tpetra::Map<>::node_type t_node_t;
  typedef Tpetra::Map<part_t, part_t, t_node_t> map_t;
  Teuchos::RCP<const map_t> map = Teuchos::rcp(new map_t(np, 0, tcomm));
  typedef Tpetra::CrsMatrix<t_scalar_t, part_t, part_t, t_node_t> tcrsMatrix_t;
  Teuchos::RCP<tcrsMatrix_t> tMatrix(new tcrsMatrix_t(map, 0));

  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
  {
    //get the vertices in each part in my part.
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    //cluster the vertices according to their parts,
    //keeping a linked list per part.
    for (t_lno_t i = 0; i < localNumVertices; ++i){
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    //coarsen: visit the vertices of each part, part by part.
    for (t_lno_t i = 0; i < np; ++i){
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      //get part i, and the first vertex v in this part.
      while (v != -1){
        //now get the neighbors of v.
        for (t_lno_t j = offsets[v]; j < offsets[v+1]; ++j){
          //get the part of the second vertex.
          part_t ep = e_parts[j];

          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0){
            ew = edge_weights[j];
          }
          //record ep as a local neighbor of part i on first sight.
          if (part_neighbor_weights[ep] < 0.00001){
            part_neighbors[num_neighbor_parts++] = ep;
          }
          part_neighbor_weights[ep] += ew;
        }
        v = part_nexts[v];
      }

      //now collect the neighbor-part weights in order.
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
        part_t neighbor_part = part_neighbors[j];
        part_neighbor_weights_ordered[j] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }

      //insert the row for part i into the tpetra crsmatrix.
      if (num_neighbor_parts > 0){
        Teuchos::ArrayView<const part_t> destinations(
            &(part_neighbors[0]), num_neighbor_parts);
        Teuchos::ArrayView<const t_scalar_t>
            vals(&(part_neighbor_weights_ordered[0]), num_neighbor_parts);
        tMatrix->insertGlobalValues(i, destinations, vals);
      }
    }
  }
  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");
  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE fillComplete");
  tMatrix->fillComplete();
  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE fillComplete");

  std::vector<part_t> part_indices(np);

  for (part_t i = 0; i < np; ++i) part_indices[i] = i;

  Teuchos::ArrayView<const part_t>
      global_ids(&(part_indices[0]), np);

  //create a map where all processors own all rows,
  //so that the import below acts as a gatherAll of the crsMatrix.
  Teuchos::RCP<const map_t> gatherRowMap(new map_t(
      Teuchos::OrdinalTraits<Tpetra::global_size_t>::invalid(), global_ids, 0, tcomm));

  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Import");
  //create the importer for the gatherAll.
  Teuchos::RCP<tcrsMatrix_t> A_gather =
      Teuchos::rcp(new tcrsMatrix_t(gatherRowMap, 0));
  typedef Tpetra::Import<typename map_t::local_ordinal_type,
                         typename map_t::global_ordinal_type,
                         typename map_t::node_type> import_type;
  import_type import(map, gatherRowMap);
  A_gather->doImport(*tMatrix, import, Tpetra::INSERT);
  A_gather->fillComplete();
  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Import");

  //create the output part arrays;
  //every processor owns a whole copy.
  g_part_xadj = ArrayRCP<part_t>(np + 1);
  g_part_adj = ArrayRCP<part_t>(A_gather->getNodeNumEntries());
  g_part_ew = ArrayRCP<t_scalar_t>(A_gather->getNodeNumEntries());

  part_t *taskidx = g_part_xadj.getRawPtr();
  part_t *taskadj = g_part_adj.getRawPtr();
  t_scalar_t *taskadjwgt = g_part_ew.getRawPtr();

  taskidx[0] = 0;

  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Import Copy");
  for (part_t i = 0; i < np; i++) {
    part_t length = A_gather->getNumEntriesInLocalRow(i);
    size_t nentries;
    taskidx[i+1] = taskidx[i] + length;
    //copy the indices and values of row i out of the gathered matrix.
    Teuchos::ArrayView<part_t> Indices(taskadj + taskidx[i], length);
    Teuchos::ArrayView<t_scalar_t> Values(taskadjwgt + taskidx[i], length);
    A_gather->getLocalRowCopy(i, Indices, Values, nentries);
  }
  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Import Copy");
}
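// Output format sketch: the coarsened graph is CSR over parts. The neighbor
// parts of part p are g_part_adj[ g_part_xadj[p] .. g_part_xadj[p+1]-1 ],
// and g_part_ew holds the accumulated weight of the edges crossing to each
// neighbor. E.g., two parts connected by three unit-weight edges give
// g_part_xadj = {0, 1, 2}, g_part_adj = {1, 0}, g_part_ew = {3, 3}.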

/*! \brief KmeansHeap: a fixed-size max-heap that keeps the points with
 *  the smallest distances to a cluster center; the root always holds the
 *  largest of the kept distances.
 */
template <class IT, class WT>
class KmeansHeap{
  IT heapSize;
  IT *indices;
  WT *values;
  WT _EPSILON;

public:
  void setHeapsize(IT heapsize_){
    this->heapSize = heapsize_;
    this->indices = allocMemory<IT>(heapsize_);
    this->values = allocMemory<WT>(heapsize_);
    this->_EPSILON = std::numeric_limits<WT>::epsilon();
  }

  ~KmeansHeap(){
    freeArray<IT>(this->indices);
    freeArray<WT>(this->values);
  }

  void addPoint(IT index, WT distance){
    WT maxVal = this->values[0];
    //add only if the distance is smaller than the current maximum distance.
    if (distance >= maxVal) return;
    else {
      this->values[0] = distance;
      this->indices[0] = index;
      this->push_down(0);
    }
  }

  //heap push down operation
  void push_down(IT index_on_heap){
    IT child_index1 = 2 * index_on_heap + 1;
    IT child_index2 = 2 * index_on_heap + 2;

    IT biggerIndex = -1;
    if (child_index1 < this->heapSize && child_index2 < this->heapSize){
      if (this->values[child_index1] < this->values[child_index2]){
        biggerIndex = child_index2;
      }
      else {
        biggerIndex = child_index1;
      }
    }
    else if (child_index1 < this->heapSize){
      biggerIndex = child_index1;
    }
    else if (child_index2 < this->heapSize){
      biggerIndex = child_index2;
    }
    if (biggerIndex >= 0 && this->values[biggerIndex] > this->values[index_on_heap]){
      WT tmpVal = this->values[biggerIndex];
      this->values[biggerIndex] = this->values[index_on_heap];
      this->values[index_on_heap] = tmpVal;

      IT tmpIndex = this->indices[biggerIndex];
      this->indices[biggerIndex] = this->indices[index_on_heap];
      this->indices[index_on_heap] = tmpIndex;
      this->push_down(biggerIndex);
    }
  }

  void initValues(){
    WT MAXVAL = std::numeric_limits<WT>::max();
    for (IT i = 0; i < this->heapSize; ++i){
      this->values[i] = MAXVAL;
      this->indices[i] = -1;
    }
  }

  //returns the total distance of the kept points to the center.
  WT getTotalDistance(){
    WT nc = 0;
    for (IT j = 0; j < this->heapSize; ++j){
      nc += this->values[j];
    }
    return nc;
  }

  //computes the new center of the cluster from the kept points;
  //returns true if the center moved by more than _EPSILON.
  bool getNewCenters(WT *center, WT **coords, int dimension){
    bool moved = false;
    for (int i = 0; i < dimension; ++i){
      WT nc = 0;
      for (IT j = 0; j < this->heapSize; ++j){
        IT k = this->indices[j];
        nc += coords[i][k];
      }
      nc /= this->heapSize;
      moved = (ZOLTAN2_ABS(center[i] - nc) > this->_EPSILON || moved);
      center[i] = nc;
    }
    return moved;
  }

  void copyCoordinates(IT *permutation){
    for (IT i = 0; i < this->heapSize; ++i){
      permutation[i] = this->indices[i];
    }
  }
};
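// Usage sketch: a KmeansHeap of size k tracks the k nearest points seen so
// far; the root holds the current largest kept distance.
//
//   KmeansHeap<int, double> heap;
//   heap.setHeapsize(4);      // keep the 4 closest points
//   heap.initValues();        // all slots start at +infinity
//   heap.addPoint(7, 0.5);    // replaces the root if 0.5 beats the max
//   // ... add the remaining candidate points ...
//   int closest[4];
//   heap.copyCoordinates(closest);  // indices of the kept points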

/*! \brief KMeansCluster: a cluster center together with the heap of the
 *  points closest to it.
 */
template <class IT, class WT>
class KMeansCluster{

  int dimension;
  KmeansHeap<IT,WT> closestPoints;

public:
  WT *center;

  ~KMeansCluster(){
    freeArray<WT>(center);
  }

  void setParams(int dimension_, int heapsize){
    this->dimension = dimension_;
    this->center = allocMemory<WT>(dimension_);
    this->closestPoints.setHeapsize(heapsize);
  }

  void clearHeap(){
    this->closestPoints.initValues();
  }

  bool getNewCenters(WT **coords){
    return this->closestPoints.getNewCenters(center, coords, dimension);
  }

  //returns the distance of the coordinate to the center,
  //and also adds the point to the heap.
  WT getDistance(IT index, WT **elementCoords){
    WT distance = 0;
    for (int i = 0; i < this->dimension; ++i){
      WT d = (center[i] - elementCoords[i][index]);
      distance += d * d;
    }
    //note: as written this takes the dimension-th root of the squared
    //distance (the Euclidean norm only when dimension == 2).
    distance = pow(distance, WT(1.0 / this->dimension));
    closestPoints.addPoint(index, distance);
    return distance;
  }

  WT getDistanceToCenter(){
    return closestPoints.getTotalDistance();
  }

  void copyCoordinates(IT *permutation){
    closestPoints.copyCoordinates(permutation);
  }
};

/*! \brief KMeansAlgorithm: a simple k-means clustering, used to find the
 *  subset of processors that are closest to each other.
 */
template <class IT, class WT>
class KMeansAlgorithm{

  int dim;
  IT numElements;
  WT **elementCoords;
  IT numClusters;
  IT required_elements;
  KMeansCluster<IT,WT> *clusters;
  WT *maxCoordinates;
  WT *minCoordinates;
public:
  ~KMeansAlgorithm(){
    freeArray<KMeansCluster<IT,WT> >(clusters);
    freeArray<WT>(maxCoordinates);
    freeArray<WT>(minCoordinates);
  }

  /*! \brief KMeansAlgorithm Constructor. */
  KMeansAlgorithm(
      int dim_,
      IT numElements_,
      WT **elementCoords_,
      IT required_elements_):
    dim(dim_),
    numElements(numElements_),
    elementCoords(elementCoords_),
    numClusters((1 << dim_) + 1),
    required_elements(required_elements_)
  {
    this->clusters = allocMemory<KMeansCluster<IT,WT> >(this->numClusters);
    //set the dimension and the number of required elements for all clusters.
    for (int i = 0; i < numClusters; ++i){
      this->clusters[i].setParams(this->dim, this->required_elements);
    }

    this->maxCoordinates = allocMemory<WT>(this->dim);
    this->minCoordinates = allocMemory<WT>(this->dim);

    //obtain the min and max coordinates for each dimension.
    for (int j = 0; j < dim; ++j){
      this->minCoordinates[j] = this->maxCoordinates[j] = this->elementCoords[j][0];
      for (IT i = 1; i < numElements; ++i){
        WT t = this->elementCoords[j][i];
        if (t > this->maxCoordinates[j]){
          this->maxCoordinates[j] = t;
        }
        if (t < minCoordinates[j]){
          this->minCoordinates[j] = t;
        }
      }
    }

    //assign the initial cluster centers: the first 2^dim clusters start
    //at the corners of the bounding box.
    for (int j = 0; j < dim; ++j){
      int mod = (1 << (j+1));
      for (int i = 0; i < numClusters - 1; ++i){
        WT c = 0;
        if ((i % mod) < mod / 2){
          c = this->maxCoordinates[j];
        }
        else {
          c = this->minCoordinates[j];
        }
        this->clusters[i].center[j] = c;
      }
    }

    //the last cluster center is placed in the middle of the bounding box.
    for (int j = 0; j < dim; ++j){
      this->clusters[numClusters - 1].center[j] = (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
    }
  }

  //performs kmeans clustering of the coordinates.
  void kmeans(){
    for (int it = 0; it < 10; ++it){
      for (IT j = 0; j < this->numClusters; ++j){
        this->clusters[j].clearHeap();
      }
      for (IT i = 0; i < this->numElements; ++i){
        for (IT j = 0; j < this->numClusters; ++j){
          this->clusters[j].getDistance(i, this->elementCoords);
        }
      }
      bool moved = false;
      for (IT j = 0; j < this->numClusters; ++j){
        moved = (this->clusters[j].getNewCenters(this->elementCoords) || moved);
      }
      if (!moved){
        break;
      }
    }
  }

  //finds the cluster whose kept points are closest to each other.
  void getMinDistanceCluster(IT *procPermutation){

    WT minDistance = this->clusters[0].getDistanceToCenter();
    IT minCluster = 0;
    for (IT j = 1; j < this->numClusters; ++j){
      WT minTmpDistance = this->clusters[j].getDistanceToCenter();
      if (minTmpDistance < minDistance){
        minDistance = minTmpDistance;
        minCluster = j;
      }
    }

    this->clusters[minCluster].copyCoordinates(procPermutation);
  }
};
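// Usage sketch (this is how getClosestSubset() below drives it): cluster
// nprocs processor coordinates into 2^dim + 1 clusters whose heaps each
// keep the ntasks nearest processors, then extract the tightest cluster.
//
//   KMeansAlgorithm<part_t, pcoord_t> kma(dim, nprocs, proc_coords, ntasks);
//   kma.kmeans();                                  // at most 10 rounds
//   kma.getMinDistanceCluster(proc_permutation);   // ntasks chosen procs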

#define MINOF(a,b) (((a)<(b))?(a):(b))

/*! \brief fillContinousArray: fills arr with 0, 1, ..., arrSize-1 when
 *  val is NULL; otherwise fills every entry with *val.
 */
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val){
  if (val == NULL){
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i){
      arr[i] = i;
    }
  }
  else {
    T v = *val;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i){
      arr[i] = v;
    }
  }
}
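// Example: fillContinousArray<int>(arr, 4, NULL) yields arr = {0, 1, 2, 3};
// with int v = 7, fillContinousArray<int>(arr, 4, &v) yields {7, 7, 7, 7}.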

/*! \brief CommunicationModel: base class for the communication models
 *  between the processors and the tasks.
 */
template <typename part_t, typename pcoord_t>
class CommunicationModel{
protected:
  double commCost;
public:

  part_t no_procs; //the number of processors
  part_t no_tasks; //the number of tasks.

  CommunicationModel(part_t no_procs_, part_t no_tasks_):
    commCost(),
    no_procs(no_procs_),
    no_tasks(no_tasks_){}

  virtual ~CommunicationModel(){}

  part_t getNProcs() const{
    return this->no_procs;
  }
  part_t getNTasks() const{
    return this->no_tasks;
  }

  //computes the cost of the given mapping: for every task edge, the
  //distance between the assigned processors, weighted by the edge weight.
  void calculateCommunicationCost(
      part_t *task_to_proc,
      part_t *task_communication_xadj,
      part_t *task_communication_adj,
      pcoord_t *task_communication_edge_weight){

    double totalCost = 0;

    part_t commCount = 0;
    for (part_t task = 0; task < this->no_tasks; ++task){
      int assigned_proc = task_to_proc[task];
      part_t task_adj_begin = task_communication_xadj[task];
      part_t task_adj_end = task_communication_xadj[task+1];

      commCount += task_adj_end - task_adj_begin;
      for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2){

        part_t neighborTask = task_communication_adj[task2];
        int neighborProc = task_to_proc[neighborTask];

        double distance = getProcDistance(assigned_proc, neighborProc);

        if (task_communication_edge_weight == NULL){
          totalCost += distance;
        }
        else {
          totalCost += distance * task_communication_edge_weight[task2];
        }
      }
    }

    this->commCost = totalCost;// commCount;
  }

  double getCommunicationCostMetric(){
    return this->commCost;
  }

  virtual double getProcDistance(int procId1, int procId2) const = 0;

  /*! \brief Performs the mapping between the processors and the tasks.
   *  \param myRank the rank of the calling processor.
   *  \param env environment (used for timers).
   *  \param proc_to_task_xadj output; CSR pointer into proc_to_task_adj.
   *  \param proc_to_task_adj output; the tasks assigned to each processor.
   *  \param task_to_proc output; the processor assigned to each task.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &proc_to_task_xadj,
      ArrayRCP<part_t> &proc_to_task_adj,
      ArrayRCP<part_t> &task_to_proc
      ) const = 0;
};

/*! \brief CoordinateCommunicationModel: models the communication cost by
 *  the geometric distance between processor and task coordinates.
 */
template <typename pcoord_t, typename tcoord_t, typename part_t>
class CoordinateCommunicationModel : public CommunicationModel<part_t, pcoord_t> {
public:
  //private:
  int proc_coord_dim; //dimension of the processor coordinates.
  pcoord_t **proc_coords; //the processor coordinates; allocated outside of the class.
  int task_coord_dim; //dimension of the task coordinates.
  tcoord_t **task_coords; //the task coordinates; allocated outside of the class.
  int partArraySize;
  part_t *partNoArray;

  int *machine_extent;
  bool *machine_extent_wrap_around;
  const MachineRepresentation<pcoord_t,part_t> *machine;

  //public:
  CoordinateCommunicationModel():
    CommunicationModel<part_t, pcoord_t>(),
    proc_coord_dim(0),
    proc_coords(0),
    task_coord_dim(0),
    task_coords(0),
    partArraySize(-1),
    partNoArray(NULL),
    machine_extent(NULL),
    machine_extent_wrap_around(NULL),
    machine(NULL){}

  virtual ~CoordinateCommunicationModel(){}

  /*! \brief Class Constructor:
   *  \param pcoord_dim_ the dimension of the processor coordinates.
   *  \param pcoords_ the processor coordinates.
   *  \param tcoord_dim_ the dimension of the task coordinates.
   *  \param tcoords_ the task coordinates.
   */
  CoordinateCommunicationModel(
      int pcoord_dim_,
      pcoord_t **pcoords_,
      int tcoord_dim_,
      tcoord_t **tcoords_,
      part_t no_procs_,
      part_t no_tasks_,
      int *machine_extent_,
      bool *machine_extent_wrap_around_,
      const MachineRepresentation<pcoord_t,part_t> *machine_ = NULL
      ):
    CommunicationModel<part_t, pcoord_t>(no_procs_, no_tasks_),
    proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
    task_coord_dim(tcoord_dim_), task_coords(tcoords_),
    partArraySize(-1),
    partNoArray(NULL),
    machine_extent(machine_extent_),
    machine_extent_wrap_around(machine_extent_wrap_around_),
    machine(machine_){
  }

  void setPartArraySize(int psize){
    this->partArraySize = psize;
  }
  void setPartArray(part_t *pNo){
    this->partNoArray = pNo;
  }

  /*! \brief Finds the ntasks processors that are closest to each other,
   *  writes them to the first ntasks entries of proc_permutation, and
   *  fills the remaining entries with -1.
   *  \param proc_permutation output; the selected processor ids.
   *  \param nprocs the number of processors.
   *  \param ntasks the number of tasks.
   */
  void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const{
    //uses k-means to pick a tight subset; an older random-subset
    //implementation is kept commented out below.

    part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
    KMeansAlgorithm<part_t, pcoord_t> kma(
        minCoordDim, nprocs,
        this->proc_coords, ntasks);

    kma.kmeans();
    kma.getMinDistanceCluster(proc_permutation);

    for (int i = ntasks; i < nprocs; ++i){
      proc_permutation[i] = -1;
    }
    /*
    //fill array.
    fillContinousArray<part_t>(proc_permutation, nprocs, NULL);
    int _u_umpa_seed = 847449649;
    srand (time(NULL));
    int a = rand() % 1000 + 1;
    _u_umpa_seed -= a;
    //permute array randomly.
    update_visit_order(proc_permutation, nprocs, _u_umpa_seed, 1);
    */
  }

  //temporary, necessary for random permutation.
  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
  {
    int a = 16807;
    int m = 2147483647;
    int q = 127773;
    int r = 2836;
    int lo, hi, test;
    double d;

    lo = _u_umpa_seed % q;
    hi = _u_umpa_seed / q;
    test = (a*lo) - (r*hi);
    if (test > 0)
      _u_umpa_seed = test;
    else
      _u_umpa_seed = test + m;
    d = (double)((double)_u_umpa_seed / (double)m);
    return (part_t)(d * (double)l);
  }
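  // This is the Park-Miller "minimal standard" generator (a = 16807,
  // m = 2^31 - 1), computed with Schrage's factorization (m = a*q + r,
  // q = 127773, r = 2836) so a*seed mod m never overflows 32-bit
  // arithmetic; the result is then scaled into [0, l).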

  virtual double getProcDistance(int procId1, int procId2) const{
    pcoord_t distance = 0;
    if (machine == NULL){
      for (int i = 0; i < this->proc_coord_dim; ++i){
        double d = ZOLTAN2_ABS(proc_coords[i][procId1] - proc_coords[i][procId2]);
        if (machine_extent_wrap_around && machine_extent_wrap_around[i]){
          //if this dimension has a wrap-around link, the shorter way
          //may be through the boundary.
          if (machine_extent[i] - d < d){
            d = machine_extent[i] - d;
          }
        }
        distance += d;
      }
    }
    else {
      this->machine->getHopCount(procId1, procId2, distance);
    }

    return distance;
  }

  //temporary; performs a random permutation of the visit order.
  void update_visit_order(part_t* visitOrder, part_t n, int &_u_umpa_seed, part_t rndm) {
    part_t *a = visitOrder;

    if (rndm){
      part_t i, u, v, tmp;

      if (n <= 4)
        return;

      for (i = 0; i < n; i += 16)
      {
        u = umpa_uRandom(n-4, _u_umpa_seed);
        v = umpa_uRandom(n-4, _u_umpa_seed);

        // FIXME (mfh 30 Sep 2015) This requires including Zoltan2_AlgMultiJagged.hpp.
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+1], a[u+1], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+2], a[u+2], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+3], a[u+3], tmp);
      }
    }
    else {
      part_t i, end = n / 4;

      for (i = 1; i < end; i++)
      {
        part_t j = umpa_uRandom(n-i, _u_umpa_seed);
        part_t t = a[j];
        a[j] = a[n-i];
        a[n-i] = t;
      }
    }
    //PermuteInPlace(visitOrder, n);
  }

  /*! \brief Maps the processors to the tasks: partitions the processor
   *  coordinates and the task coordinates into the same number of parts
   *  with multijagged, then matches the parts one-to-one.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &rcp_proc_to_task_xadj, //CSR pointer into the task array.
      ArrayRCP<part_t> &rcp_proc_to_task_adj, //the tasks assigned to each processor.
      ArrayRCP<part_t> &rcp_task_to_proc //the processor assigned to each task.
      ) const{

    rcp_proc_to_task_xadj = ArrayRCP<part_t>(this->no_procs+1);
    rcp_proc_to_task_adj = ArrayRCP<part_t>(this->no_tasks);
    rcp_task_to_proc = ArrayRCP<part_t>(this->no_tasks);

    part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
    part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
    part_t *task_to_proc = rcp_task_to_proc.getRawPtr();

    part_t invalid = 0;
    fillContinousArray<part_t>(proc_to_task_xadj, this->no_procs+1, &invalid);

    //obtain the number of parts that should be divided.
    part_t num_parts = MINOF(this->no_procs, this->no_tasks);
    //we no longer use the min coordinate dim: if the machine dimension is
    //greater than the task dimension, we would end up with a long line.
    //part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);

    int recursion_depth = partArraySize;
    //if (partArraySize < minCoordDim) recursion_depth = minCoordDim;
    if (partArraySize == -1)
      recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;

    //the number of different permutations of the task and processor
    //dimension orderings.
    int taskPerm = z2Fact<int>(this->task_coord_dim);
    int procPerm = z2Fact<int>(this->proc_coord_dim);

    int permutations = taskPerm * procPerm; //total number of permutations
    //add the cases where the processors are divided along their longest
    //dimension, while the tasks use a fixed dimension order,
    permutations += taskPerm;
    //the cases where the tasks are divided along their longest dimension,
    //while the processors use a fixed order,
    permutations += procPerm;
    //and one more where both are divided along their longest dimension.
    permutations += 1;

    //holds the pointers to proc_adjList
    part_t *proc_xadj = allocMemory<part_t>(num_parts+1);
    //holds the processors in parts according to the result of the
    //partitioning algorithm: the processors assigned to part x are at
    //proc_adjList[ proc_xadj[x] : proc_xadj[x+1] ].
    part_t *proc_adjList = allocMemory<part_t>(this->no_procs);

    part_t used_num_procs = this->no_procs;
    if (this->no_procs > this->no_tasks){
      //obtain the subset of the processors that are closest to each other.
      this->getClosestSubset(proc_adjList, this->no_procs, this->no_tasks);
      used_num_procs = this->no_tasks;
    }
    else {
      fillContinousArray<part_t>(proc_adjList, this->no_procs, NULL);
    }

    int myPermutation = myRank % permutations; //the index of the permutation
    bool task_partition_along_longest_dim = false;
    bool proc_partition_along_longest_dim = false;

    int myProcPerm = 0;
    int myTaskPerm = 0;

    if (myPermutation == 0){
      task_partition_along_longest_dim = true;
      proc_partition_along_longest_dim = true;
    }
    else {
      --myPermutation;
      if (myPermutation < taskPerm){
        proc_partition_along_longest_dim = true;
        myTaskPerm = myPermutation; // the index of the task permutation
      }
      else {
        myPermutation -= taskPerm;
        if (myPermutation < procPerm){
          task_partition_along_longest_dim = true;
          myProcPerm = myPermutation; // the index of the proc permutation
        }
        else {
          myPermutation -= procPerm;
          myProcPerm = myPermutation % procPerm; // the index of the proc permutation
          myTaskPerm = myPermutation / procPerm; // the index of the task permutation
        }
      }
    }

    int *permutation = allocMemory<int>((this->proc_coord_dim > this->task_coord_dim)
        ? this->proc_coord_dim : this->task_coord_dim);

    //get the permutation order from the proc permutation index.
    ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
    //reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *>(this->proc_coord_dim);
    for (int i = 0; i < this->proc_coord_dim; ++i){
      pcoords[i] = this->proc_coords[permutation[i]];
    }

    //do the partitioning and renumber the parts.
    env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");

    AlgMJ<pcoord_t, part_t, part_t, part_t> mj_partitioner;
    mj_partitioner.sequential_task_partitioning(
        env,
        this->no_procs,
        used_num_procs,
        num_parts,
        this->proc_coord_dim,
        pcoords, //this->proc_coords,
        proc_adjList,
        proc_xadj,
        recursion_depth,
        partNoArray,
        proc_partition_along_longest_dim
        //,"proc_partitioning"
        );
    env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
    freeArray<pcoord_t *>(pcoords);

    part_t *task_xadj = allocMemory<part_t>(num_parts+1);
    part_t *task_adjList = allocMemory<part_t>(this->no_tasks);
    //fill task_adjList such that task_adjList[i] <- i.
    fillContinousArray<part_t>(task_adjList, this->no_tasks, NULL);

    //get the permutation order from the task permutation index.
    ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);

    //reorder the task coordinate dimensions.
    tcoord_t **tcoords = allocMemory<tcoord_t *>(this->task_coord_dim);
    for (int i = 0; i < this->task_coord_dim; ++i){
      tcoords[i] = this->task_coords[permutation[i]];
    }

    env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
    //partitioning of tasks
    mj_partitioner.sequential_task_partitioning(
        env,
        this->no_tasks,
        this->no_tasks,
        num_parts,
        this->task_coord_dim,
        tcoords, //this->task_coords,
        task_adjList,
        task_xadj,
        recursion_depth,
        partNoArray,
        task_partition_along_longest_dim
        //,"task_partitioning"
        );
    env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");
    freeArray<tcoord_t *>(tcoords);
    freeArray<int>(permutation);

    //fill the proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
    for (part_t i = 0; i < num_parts; ++i){

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t proc_index_end = proc_xadj[i+1];
      part_t task_end_index = task_xadj[i+1];

      if (proc_index_end - proc_index_begin != 1){
        std::cerr << "Error at partitioning of processors" << std::endl;
        std::cerr << "PART:" << i << " is assigned to " << proc_index_end - proc_index_begin << " processors." << std::endl;
        exit(1);
      }
      part_t assigned_proc = proc_adjList[proc_index_begin];
      proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
    }

    //convert proc_to_task_xadj from per-processor task counts to a CSR
    //index array via a prefix sum; proc_to_task_xadj_work keeps the
    //end-of-row positions, used below to fill proc_to_task_adj backwards.
    part_t *proc_to_task_xadj_work = allocMemory<part_t>(this->no_procs);
    part_t sum = 0;
    for (part_t i = 0; i < this->no_procs; ++i){
      part_t tmp = proc_to_task_xadj[i];
      proc_to_task_xadj[i] = sum;
      sum += tmp;
      proc_to_task_xadj_work[i] = sum;
    }
    proc_to_task_xadj[this->no_procs] = sum;

    for (part_t i = 0; i < num_parts; ++i){

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t task_end_index = task_xadj[i+1];

      part_t assigned_proc = proc_adjList[proc_index_begin];

      for (part_t j = task_begin_index; j < task_end_index; ++j){
        part_t taskId = task_adjList[j];

        task_to_proc[taskId] = assigned_proc;

        proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
      }
    }

    freeArray<part_t>(proc_to_task_xadj_work);
    freeArray<part_t>(task_xadj);
    freeArray<part_t>(task_adjList);
    freeArray<part_t>(proc_xadj);
    freeArray<part_t>(proc_adjList);
  }

};
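// Usage sketch (variable names hypothetical; mirrors how the
// CoordinateTaskMapper constructor below builds and runs the model):
//
//   CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t> model(
//       procDim, procCoordinates,   // machine coordinates, procDim x nprocs
//       coordDim, partCenters,      // part centers, coordDim x ntasks
//       nprocs, ntasks,
//       machine_extent, machine_extent_wrap_around);
//   ArrayRCP<part_t> p2t_xadj, p2t_adj, t2p;
//   model.getMapping(comm->getRank(), env, p2t_xadj, p2t_adj, t2p);
//   // t2p[task] now holds the rank assigned to each task.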

/*! \brief CoordinateTaskMapper: maps the tasks to the processors using
 *  their coordinates.
 */
template <typename Adapter, typename part_t>
class CoordinateTaskMapper : public PartitionMapping<Adapter>{
protected:

#ifndef DOXYGEN_SHOULD_SKIP_THIS

  typedef typename Adapter::scalar_t pcoord_t;
  typedef typename Adapter::scalar_t tcoord_t;
  typedef typename Adapter::scalar_t scalar_t;

#endif

  //RCP<const Environment> env;
  ArrayRCP<part_t> proc_to_task_xadj; //CSR pointer into proc_to_task_adj.
  ArrayRCP<part_t> proc_to_task_adj; //the tasks assigned to each processor.
  ArrayRCP<part_t> task_to_proc; //the processor each task is mapped to.
  bool isOwnerofModel;
  CoordinateCommunicationModel<pcoord_t,tcoord_t,part_t> *proc_task_comm;
  part_t nprocs;
  part_t ntasks;
  ArrayRCP<part_t> task_communication_xadj;
  ArrayRCP<part_t> task_communication_adj;
  ArrayRCP<scalar_t> task_communication_edge_weight;
  const part_t *solution_parts; //the part assigned to each local coordinate.


  /*! \brief doMapping: calls the getMapping function of the
   *  communication model object.
   */
  void doMapping(int myRank){

    if (this->proc_task_comm){
      this->proc_task_comm->getMapping(
          myRank,
          this->env,
          this->proc_to_task_xadj, //CSR pointer into the task array.
          this->proc_to_task_adj, //the tasks assigned to each processor.
          this->task_to_proc //the processor mapped to each task.
          );
    }
    else {
      std::cerr << "communicationModel is not specified in the Mapper" << std::endl;
      exit(1);
    }
  }

  /*! \brief Creates the subcommunicator of the ranks that together try
   *  every dimension-permutation variant; the best mapping is later
   *  chosen within each group.
   */
  RCP<Comm<int> > create_subCommunicator(){
    int procDim = this->proc_task_comm->proc_coord_dim;
    int taskDim = this->proc_task_comm->task_coord_dim;

    //the number of permutations for the task and proc dimension orderings.
    int taskPerm = z2Fact<int>(taskDim);
    int procPerm = z2Fact<int>(procDim);
    int idealGroupSize = taskPerm * procPerm; //total number of permutations

    idealGroupSize += taskPerm + procPerm + 1; //for the variants that partition along the longest dimension.

    int myRank = this->comm->getRank();
    int commSize = this->comm->getSize();

    int myGroupIndex = myRank / idealGroupSize;

    int prevGroupBegin = (myGroupIndex - 1) * idealGroupSize;
    if (prevGroupBegin < 0) prevGroupBegin = 0;
    int myGroupBegin = myGroupIndex * idealGroupSize;
    int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
    int nextGroupEnd = (myGroupIndex + 2) * idealGroupSize;

    //if my group would be a short tail group, merge it with the previous one.
    if (myGroupEnd > commSize){
      myGroupBegin = prevGroupBegin;
      myGroupEnd = commSize;
    }
    if (nextGroupEnd > commSize){
      myGroupEnd = commSize;
    }
    int myGroupSize = myGroupEnd - myGroupBegin;

    part_t *myGroup = allocMemory<part_t>(myGroupSize);
    for (int i = 0; i < myGroupSize; ++i){
      myGroup[i] = myGroupBegin + i;
    }

    ArrayView<const part_t> myGroupView(myGroup, myGroupSize);

    RCP<Comm<int> > subComm = this->comm->createSubcommunicator(myGroupView);
    freeArray<part_t>(myGroup);
    return subComm;
  }
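  // Example: with 3-D processor and 3-D task coordinates,
  // idealGroupSize = 3! * 3! + 3! + 3! + 1 = 49; ranks 0..48 form the
  // first group, ranks 49..97 the second, and so on. Each group covers
  // all 49 partitioning variants and keeps the cheapest mapping.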

  /*! \brief Finds the lowest-cost mapping within the subcommunicator and
   *  broadcasts it from the owning rank to the whole group.
   */
  void getBestMapping(){
    //create the sub group.
    RCP<Comm<int> > subComm = this->create_subCommunicator();
    //calculate the cost of my mapping.
    double myCost = this->proc_task_comm->getCommunicationCostMetric();
    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    Teuchos::Zoltan2_ReduceBestMapping<int,double> reduceBest;
    reduceAll<int, double>(*subComm, reduceBest,
        2, localCost, globalCost);

    int sender = int(globalCost[1]);

    if (this->comm->getRank() == 0){
      std::cout << "me:" << localCost[1] <<
          " localcost:" << localCost[0] <<
          " bestcost:" << globalCost[0] <<
          " Sender:" << sender <<
          " procDim:" << proc_task_comm->proc_coord_dim <<
          " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
    }
    broadcast(*subComm, sender, this->ntasks, this->task_to_proc.getRawPtr());
    broadcast(*subComm, sender, this->nprocs, this->proc_to_task_xadj.getRawPtr());
    broadcast(*subComm, sender, this->ntasks, this->proc_to_task_adj.getRawPtr());
  }

  //write the mapping to gnuPlot code to visualize it.
  void writeMapping(){
    std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);

    int mindim = MINOF(proc_task_comm->proc_coord_dim, proc_task_comm->task_coord_dim);
    std::string ss = "";
    for (part_t i = 0; i < this->nprocs; ++i){

      std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
      if (i == 0){
        gnuPlotCode << "plot \"" << procFile << "\"\n";
      }
      else {
        gnuPlotCode << "replot \"" << procFile << "\"\n";
      }

      std::ofstream inpFile(procFile.c_str(), std::ofstream::out);

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j){
        if (j == mindim - 1){
          inpFile << proc_task_comm->proc_coords[j][i];
          gnuPlotArrow += Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);
        }
        else {
          inpFile << proc_task_comm->proc_coords[j][i] << " ";
          gnuPlotArrow += Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]) + ",";
        }
      }
      gnuPlotArrow += " to ";

      inpFile << std::endl;
      ArrayView<part_t> a = this->getAssignedTasksForProc(i);
      for (int k = 0; k < a.size(); ++k){
        int j = a[k];
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for (int z = 0; z < mindim; ++z){
          if (z == mindim - 1){
            inpFile << proc_task_comm->task_coords[z][j];
            gnuPlotArrow2 += Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
          }
          else {
            inpFile << proc_task_comm->task_coords[z][j] << " ";
            gnuPlotArrow2 += Teuchos::toString<float>(proc_task_comm->task_coords[z][j]) + ",";
          }
        }
        ss += gnuPlotArrow2 + "\n";
        inpFile << std::endl;
      }
      inpFile.close();
    }
    gnuPlotCode << ss;
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }

  //write the mapping to gnuPlot code to visualize it.
  void writeMapping2(int myRank){

    std::string rankStr = Teuchos::toString<int>(myRank);
    std::string gnuPlots = "gnuPlot", extentionS = ".plot";
    std::string outF = gnuPlots + rankStr + extentionS;
    std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

    CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t> *tmpproc_task_comm =
        static_cast<CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t> *>(
            proc_task_comm);
    int mindim = MINOF(tmpproc_task_comm->proc_coord_dim, tmpproc_task_comm->task_coord_dim);
    std::string ss = "";
    std::string procs = "", parts = "";
    for (part_t i = 0; i < this->nprocs; ++i){

      ArrayView<part_t> a = this->getAssignedTasksForProc(i);
      if (a.size() == 0){
        continue;
      }

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j){
        if (j == mindim - 1){
          gnuPlotArrow += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][i]);
          procs += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][i]);
        }
        else {
          gnuPlotArrow += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][i]) + ",";
          procs += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][i]) + " ";
        }
      }
      procs += "\n";

      gnuPlotArrow += " to ";

      for (int k = 0; k < a.size(); ++k){
        int j = a[k];
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for (int z = 0; z < mindim; ++z){
          if (z == mindim - 1){
            gnuPlotArrow2 += Teuchos::toString<float>(tmpproc_task_comm->task_coords[z][j]);
            parts += Teuchos::toString<float>(tmpproc_task_comm->task_coords[z][j]);
          }
          else {
            gnuPlotArrow2 += Teuchos::toString<float>(tmpproc_task_comm->task_coords[z][j]) + ",";
            parts += Teuchos::toString<float>(tmpproc_task_comm->task_coords[z][j]) + " ";
          }
        }
        parts += "\n";
        ss += gnuPlotArrow2 + " nohead\n";
      }
    }

    std::ofstream procFile("procPlot.plot", std::ofstream::out);
    procFile << procs << "\n";
    procFile.close();

    std::ofstream partFile("partPlot.plot", std::ofstream::out);
    partFile << parts << "\n";
    partFile.close();

    std::ofstream extraProcFile("allProc.plot", std::ofstream::out);

    for (part_t j = 0; j < this->nprocs; ++j){
      for (int i = 0; i < mindim; ++i){
        extraProcFile << tmpproc_task_comm->proc_coords[i][j] << " ";
      }
      extraProcFile << std::endl;
    }

    extraProcFile.close();

    gnuPlotCode << ss;
    if (mindim == 2){
      gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
    } else {
      gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
    }
    gnuPlotCode << "replot \"partPlot.plot\" with points pointsize 3\n";
    gnuPlotCode << "replot \"allProc.plot\" with points pointsize 0.65\n";
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }

// KDD Need to provide access to algorithm for getPartBoxes
#ifdef gnuPlot
  void writeGnuPlot(
      const Teuchos::Comm<int> *comm_,
      const Zoltan2::PartitioningSolution<Adapter> *soln_,
      int coordDim,
      tcoord_t **partCenters
      ){
    std::string file = "gggnuPlot";
    std::string exten = ".plot";
    std::ofstream mm("2d.txt");
    file += Teuchos::toString<int>(comm_->getRank()) + exten;
    std::ofstream ff(file.c_str());
    std::vector<Zoltan2::coordinateModelPartBox<tcoord_t, part_t> > outPartBoxes =
        ((Zoltan2::PartitioningSolution<Adapter> *)soln_)->getPartBoxesView();

    for (part_t i = 0; i < this->ntasks; ++i){
      outPartBoxes[i].writeGnuPlot(ff, mm);
    }
    if (coordDim == 2){
      ff << "plot \"2d.txt\"" << std::endl;
    }
    else {
      ff << "splot \"2d.txt\"" << std::endl;
    }
    mm.close();

    ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1" << std::endl;
    //draw an arrow between the centers of every pair of communicating parts.
    for (part_t i = 0; i < this->ntasks; ++i){
      part_t pb = task_communication_xadj[i];
      part_t pe = task_communication_xadj[i+1];
      for (part_t p = pb; p < pe; ++p){
        part_t n = task_communication_adj[p];

        std::string arrowline = "set arrow from ";
        for (int j = 0; j < coordDim - 1; ++j){
          arrowline += Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
        }
        arrowline += Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) + " to ";

        for (int j = 0; j < coordDim - 1; ++j){
          arrowline += Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
        }
        arrowline += Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) + " as 5\n";

        ff << arrowline;
      }
    }

    ff << "replot\n pause -1" << std::endl;
    ff.close();
  }
#endif // gnuPlot

public:

  void getProcTask(part_t* &proc_to_task_xadj_, part_t* &proc_to_task_adj_){
    proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
    proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
  }

  void map(RCP<MappingSolution<Adapter> > &mappingsoln) {

    // Mapping was already computed in the constructor; we need to store it
    // in the solution.
    mappingsoln->setMap_PartsForRank(this->proc_to_task_xadj,
                                     this->proc_to_task_adj);

    // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
    // KDDKDD is not needed once we use MappingSolution to answer queries
    // KDDKDD instead of this algorithm.
    // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
    // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
  }

  virtual ~CoordinateTaskMapper(){
    //freeArray<part_t>(proc_to_task_xadj);
    //freeArray<part_t>(proc_to_task_adj);
    //freeArray<part_t>(task_to_proc);
    if (this->isOwnerofModel){
      delete this->proc_task_comm;
    }
  }
1776 
1777 
1789  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
1790  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> > machine_,
1791  const Teuchos::RCP <const Adapter> input_adapter_,
1792  const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> > soln_,
1793  const Teuchos::RCP <const Environment> envConst):
1794  PartitionMapping<Adapter> (comm_, machine_, input_adapter_, soln_, envConst),
1795  proc_to_task_xadj(0),
1796  proc_to_task_adj(0),
1797  task_to_proc(0),
1798  isOwnerofModel(true),
1799  proc_task_comm(0),
1803  using namespace Teuchos;
1804  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
1805 
1806  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
1807  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
1808 
1809  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
1810  RCP<const Environment> envConst_ = envConst;
1811 
1812  RCP<const ctm_base_adapter_t> baseInputAdapter_ (
1813  rcp(dynamic_cast<const ctm_base_adapter_t *>(input_adapter_.getRawPtr()), false));
1814 
1815  modelFlag_t coordFlags_, graphFlags_;
1816 
1817  //create coordinate model
1818  //since this is coordinate task mapper,
1819  //the adapter has to have the coordinates
1820  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
1821  baseInputAdapter_, envConst_, rcp_comm, coordFlags_));
1822 
1823  //if the adapter has also graph model, we will use graph model
1824  //to calculate the cost mapping.
1825  BaseAdapterType inputType_ = input_adapter_->adapterType();
1826  if (inputType_ == MatrixAdapterType ||
1827  inputType_ == GraphAdapterType ||
1828  inputType_ == MeshAdapterType)
1829  {
1830  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
1831  baseInputAdapter_, envConst_, rcp_comm,
1832  graphFlags_));
1833  }
1834 
1835  if (!machine_->hasMachineCoordinates()) {
1836  throw std::runtime_error("Existing machine does not provide coordinates "
1837  "for coordinate task mapping");
1838  }
1839 
1840  //if mapping type is 0 then it is coordinate mapping
1841  int procDim = machine_->getMachineDim();
1842  this->nprocs = machine_->getNumRanks();
1843 
1844  //get processor coordinates.
1845  pcoord_t **procCoordinates = NULL;
1846  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
1847  throw std::runtime_error("Existing machine does not implement "
1848  "getAllMachineCoordinatesView");
1849  }
1850 
1851  //get the machine extent.
1852  //if we have the machine extent and
1853  //the machine has wrap-around links, we would like to shift the coordinates
1854  //so that the largest gap falls on the wrap-around link.
1855  std::vector <int> machine_extent_vec (procDim);
1856  //std::vector <bool> machine_extent_wrap_around_vec(procDim, 0);
1857  int *machine_extent = &(machine_extent_vec[0]);
1858  bool *machine_extent_wrap_around = new bool[procDim];
1859  machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
1860 
1861 
1862 
1863  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
1864  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
1865  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
1866  // MD: Yes, I ADDED BELOW:
1867  if (machine_->getMachineExtent(machine_extent)) {
1868  procCoordinates =
1869  this->shiftMachineCoordinates (
1870  procDim,
1871  machine_extent,
1872  machine_extent_wrap_around,
1873  this->nprocs,
1874  procCoordinates);
1875  }
1876 
1877 
1878  //get the tasks information, such as coordinate dimension,
1879  //number of parts.
1880  int coordDim = coordinateModel_->getCoordinateDim();
1881 
1882 
1883  this->ntasks = soln_->getActualGlobalNumberOfParts();
1884  if (part_t (soln_->getTargetGlobalNumberOfParts()) > this->ntasks){
1885  this->ntasks = soln_->getTargetGlobalNumberOfParts();
1886  }
1887  this->solution_parts = soln_->getPartListView();
1888 
1889  //we need to calculate the center of parts.
1890  tcoord_t **partCenters = NULL;
1891  partCenters = allocMemory<tcoord_t *>(coordDim);
1892  for (int i = 0; i < coordDim; ++i){
1893  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
1894  }
1895 
1896  typedef typename Adapter::scalar_t t_scalar_t;
1897 
1898 
1899  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
1900 
1901  //get centers for the parts.
1902  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
1903  envConst.getRawPtr(),
1904  comm_.getRawPtr(),
1905  coordinateModel_.getRawPtr(),
1906  this->solution_parts,
1907  //soln_->getPartListView();
1908  //this->soln.getRawPtr(),
1909  coordDim,
1910  ntasks,
1911  partCenters);
1912 
1913  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
1914 
1915 
1916  //create the part graph
1917  if (graph_model_.getRawPtr() != NULL){
1918  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t> (
1919  envConst.getRawPtr(),
1920  comm_.getRawPtr(),
1921  graph_model_.getRawPtr(),
1922  this->ntasks,
1923  this->solution_parts,
1924  //soln_->getPartListView(),
1925  //this->soln.getRawPtr(),
1926  task_communication_xadj,
1927  task_communication_adj,
1928  task_communication_edge_weight
1929  );
1930  }
1931 
1932  //create coordinate communication model.
1933  this->proc_task_comm =
1934  new CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
1935  procDim,
1936  procCoordinates,
1937  coordDim,
1938  partCenters,
1939  this->nprocs,
1940  this->ntasks,
1941  machine_extent,
1942  machine_extent_wrap_around,
1943  machine_.getRawPtr());
1944 
1945  int myRank = comm_->getRank();
1946 
1947 
1948  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
1949  this->doMapping(myRank);
1950  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
1951 
1952 
1953  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
1954 
1955  /*soln_->getCommunicationGraph(task_communication_xadj,
1956  task_communication_adj);
1957  */
1958 
1959  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
1960  #ifdef gnuPlot
1961  if (comm_->getRank() == 0){
1962 
1963  part_t taskCommCount = task_communication_xadj.size() - 1; // xadj has ntasks+1 entries
1964  std::cout << " TotalComm:" << task_communication_xadj[taskCommCount] << std::endl;
1965  part_t maxN = task_communication_xadj[0];
1966  for (part_t i = 1; i <= taskCommCount; ++i){
1967  part_t nc = task_communication_xadj[i] - task_communication_xadj[i-1];
1968  if (maxN < nc) maxN = nc;
1969  }
1970  std::cout << " maxNeighbor:" << maxN << std::endl;
1971  }
1972 
1973  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
1974  #endif
1975 
1976  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
1977 
1978  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
1979  this->proc_task_comm->calculateCommunicationCost(
1980  task_to_proc.getRawPtr(),
1981  task_communication_xadj.getRawPtr(),
1982  task_communication_adj.getRawPtr(),
1983  task_communication_edge_weight.getRawPtr()
1984  );
1985  }
1986 
1987  //std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
1988 
1989  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
1990 
1991  //processors are divided into groups of size procDim! * coordDim!
1992  //each processor in the group obtains a mapping with a different rotation
1993  //and the best one is broadcast to all processors.
1994  this->getBestMapping();
1995 
1996  /*
1997  {
1998  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr())
1999  this->proc_task_comm->calculateCommunicationCost(
2000  task_to_proc.getRawPtr(),
2001  task_communication_xadj.getRawPtr(),
2002  task_communication_adj.getRawPtr(),
2003  task_communication_edge_weight.getRawPtr()
2004  );
2005  std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2006  }
2007  */
2008 
2009 
2010  #ifdef gnuPlot
2011  this->writeMapping2(comm_->getRank());
2012  #endif
2013 
2014  delete []machine_extent_wrap_around;
2015  if (machine_->getMachineExtent(machine_extent)){
2016  for (int i = 0; i < procDim; ++i){
2017  delete [] procCoordinates[i];
2018  }
2019  delete [] procCoordinates;
2020  }
2021 
2022  for (int i = 0; i < coordDim; ++i){
2023  freeArray<tcoord_t>(partCenters[i]);
2024  }
2025  freeArray<tcoord_t *>(partCenters);
2026 
2027  }
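 // Construction sketch (hypothetical; `machine`, `adapter`, `solution`, and
 // `env` are assumed to be RCP-wrapped objects built elsewhere, and adapter_t
 // is the input adapter type):
 //
 // Teuchos::RCP<CoordinateTaskMapper<adapter_t, part_t> > mapper(
 //   new CoordinateTaskMapper<adapter_t, part_t>(
 //     comm, machine, adapter, solution, env));
 // part_t owner = mapper->getAssignedProcForTask(0); // rank assigned to task 0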
2028 
2029 
2040  CoordinateTaskMapper(
2041  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2042  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> > machine_,
2043  const Teuchos::RCP <const Adapter> input_adapter_,
2044  const part_t num_parts_,
2045  const part_t *result_parts,
2046  const Teuchos::RCP <const Environment> envConst,
2047  bool is_input_adapter_distributed = true):
2048  PartitionMapping<Adapter> (comm_, machine_, input_adapter_, num_parts_, result_parts, envConst),
2049  proc_to_task_xadj(0),
2050  proc_to_task_adj(0),
2051  task_to_proc(0),
2052  isOwnerofModel(true),
2053  proc_task_comm(0),
2054  task_communication_xadj(0),
2055  task_communication_adj(0),
2056  task_communication_edge_weight(0){
2057  using namespace Teuchos;
2058  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2059 
2060  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2061  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2062 
2063  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2064  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2065  if (!is_input_adapter_distributed){
2066  ia_comm = Teuchos::createSerialComm<int>();
2067  }
2068  RCP<const Environment> envConst_ = envConst;
2069 
2070  RCP<const ctm_base_adapter_t> baseInputAdapter_ (
2071  rcp(dynamic_cast<const ctm_base_adapter_t *>(input_adapter_.getRawPtr()), false));
2072 
2073  modelFlag_t coordFlags_, graphFlags_;
2074 
2075  //create coordinate model
2076  //since this is coordinate task mapper,
2077  //the adapter has to have the coordinates
2078  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2079  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2080 
2081  //if the adapter also provides a graph model, we will use the graph model
2082  //to calculate the communication cost of the mapping.
2083  BaseAdapterType inputType_ = input_adapter_->adapterType();
2084  if (inputType_ == MatrixAdapterType ||
2085  inputType_ == GraphAdapterType ||
2086  inputType_ == MeshAdapterType)
2087  {
2088  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2089  baseInputAdapter_, envConst_, ia_comm,
2090  graphFlags_));
2091  }
2092 
2093  if (!machine_->hasMachineCoordinates()) {
2094  throw std::runtime_error("Existing machine does not provide coordinates "
2095  "for coordinate task mapping");
2096  }
2097 
2098  //if mapping type is 0 then it is coordinate mapping
2099  int procDim = machine_->getMachineDim();
2100  this->nprocs = machine_->getNumRanks();
2101 
2102  //get processor coordinates.
2103  pcoord_t **procCoordinates = NULL;
2104  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2105  throw std::runtime_error("Existing machine does not implement "
2106  "getAllMachineCoordinatesView");
2107  }
2108 
2109  //get the machine extent.
2110  //if we have the machine extent and
2111  //the machine has wrap-around links, we would like to shift the coordinates
2112  //so that the largest gap falls on the wrap-around link.
2113  std::vector <int> machine_extent_vec (procDim);
2114  //std::vector <bool> machine_extent_wrap_around_vec(procDim, 0);
2115  int *machine_extent = &(machine_extent_vec[0]);
2116  bool *machine_extent_wrap_around = new bool[procDim];
2117  machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2118 
2119 
2120 
2121  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2122  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2123  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2124  // MD: Yes, I ADDED BELOW:
2125  if (machine_->getMachineExtent(machine_extent)) {
2126  procCoordinates =
2127  this->shiftMachineCoordinates (
2128  procDim,
2129  machine_extent,
2130  machine_extent_wrap_around,
2131  this->nprocs,
2132  procCoordinates);
2133  }
2134 
2135 
2136  //get the tasks information, such as coordinate dimension,
2137  //number of parts.
2138  int coordDim = coordinateModel_->getCoordinateDim();
2139 
2140 
2141  this->ntasks = num_parts_;
2142  this->solution_parts = result_parts;
2143 
2144  //we need to calculate the center of parts.
2145  tcoord_t **partCenters = NULL;
2146  partCenters = allocMemory<tcoord_t *>(coordDim);
2147  for (int i = 0; i < coordDim; ++i){
2148  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2149  }
2150 
2151  typedef typename Adapter::scalar_t t_scalar_t;
2152 
2153 
2154  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2155 
2156  //get centers for the parts.
2157  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2158  envConst.getRawPtr(),
2159  ia_comm.getRawPtr(),
2160  coordinateModel_.getRawPtr(),
2161  this->solution_parts,
2162  //soln_->getPartListView();
2163  //this->soln.getRawPtr(),
2164  coordDim,
2165  ntasks,
2166  partCenters);
2167 
2168  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2169 
2170  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2171  //create the part graph
2172  if (graph_model_.getRawPtr() != NULL){
2173  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t> (
2174  envConst.getRawPtr(),
2175  ia_comm.getRawPtr(),
2176  graph_model_.getRawPtr(),
2177  this->ntasks,
2178  this->solution_parts,
2179  //soln_->getPartListView(),
2180  //this->soln.getRawPtr(),
2181  task_communication_xadj,
2182  task_communication_adj,
2183  task_communication_edge_weight
2184  );
2185  }
2186  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2187 
2188 
2189  envConst->timerStart(MACRO_TIMERS, "CoordinateCommunicationModel Create");
2190  //create coordinate communication model.
2191  this->proc_task_comm =
2192  new CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
2193  procDim,
2194  procCoordinates,
2195  coordDim,
2196  partCenters,
2197  this->nprocs,
2198  this->ntasks,
2199  machine_extent,
2200  machine_extent_wrap_around,
2201  machine_.getRawPtr());
2202 
2203  envConst->timerStop(MACRO_TIMERS, "CoordinateCommunicationModel Create");
2204 
2205  int myRank = comm_->getRank();
2206 
2207 
2208  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2209  this->doMapping(myRank);
2210  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2211 
2212 
2213  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2214 
2215  /*soln_->getCommunicationGraph(task_communication_xadj,
2216  task_communication_adj);
2217  */
2218 
2219  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2220  #ifdef gnuPlot
2221  if (comm_->getRank() == 0){
2222 
2223  part_t taskCommCount = task_communication_xadj.size() - 1; // xadj has ntasks+1 entries
2224  std::cout << " TotalComm:" << task_communication_xadj[taskCommCount] << std::endl;
2225  part_t maxN = task_communication_xadj[0];
2226  for (part_t i = 1; i <= taskCommCount; ++i){
2227  part_t nc = task_communication_xadj[i] - task_communication_xadj[i-1];
2228  if (maxN < nc) maxN = nc;
2229  }
2230  std::cout << " maxNeighbor:" << maxN << std::endl;
2231  }
2232 
2233  //this->writeGnuPlot(comm_, soln_, coordDim, partCenters); // no soln_ parameter in this constructor
2234  #endif
2235 
2236  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2237 
2238  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
2239  this->proc_task_comm->calculateCommunicationCost(
2240  task_to_proc.getRawPtr(),
2241  task_communication_xadj.getRawPtr(),
2242  task_communication_adj.getRawPtr(),
2243  task_communication_edge_weight.getRawPtr()
2244  );
2245  }
2246 
2247  //std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2248 
2249  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2250 
2251  //processors are divided into groups of size procDim! * coordDim!
2252  //each processor in the group obtains a mapping with a different rotation
2253  //and the best one is broadcast to all processors.
2254  this->getBestMapping();
2255 
2256  /*
2257 
2258  {
2259  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr())
2260  this->proc_task_comm->calculateCommunicationCost(
2261  task_to_proc.getRawPtr(),
2262  task_communication_xadj.getRawPtr(),
2263  task_communication_adj.getRawPtr(),
2264  task_communication_edge_weight.getRawPtr()
2265  );
2266  std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2267  }
2268  */
2269 
2270 
2271 
2272  #ifdef gnuPlot
2273  this->writeMapping2(comm_->getRank());
2274  #endif
2275 
2276  delete []machine_extent_wrap_around;
2277  if (machine_->getMachineExtent(machine_extent)){
2278  for (int i = 0; i < procDim; ++i){
2279  delete [] procCoordinates[i];
2280  }
2281  delete [] procCoordinates;
2282  }
2283 
2284  for (int i = 0; i < coordDim; ++i){
2285  freeArray<tcoord_t>(partCenters[i]);
2286  }
2287  freeArray<tcoord_t *>(partCenters);
2288 
2289  }
2290 
2337  CoordinateTaskMapper(
2338  const Environment *env_const_,
2339  const Teuchos::Comm<int> *problemComm,
2340  int proc_dim,
2341  int num_processors,
2342  pcoord_t **machine_coords,
2343 
2344  int task_dim,
2345  part_t num_tasks,
2346  tcoord_t **task_coords,
2347  ArrayRCP<part_t>task_comm_xadj,
2348  ArrayRCP<part_t>task_comm_adj,
2349  pcoord_t *task_communication_edge_weight_,
2350  int recursion_depth,
2351  part_t *part_no_array,
2352  const part_t *machine_dimensions
2353  ): PartitionMapping<Adapter>(
2354  Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
2355  Teuchos::rcpFromRef<const Environment> (*env_const_)),
2356  proc_to_task_xadj(0),
2357  proc_to_task_adj(0),
2358  task_to_proc(0),
2359  isOwnerofModel(true),
2360  proc_task_comm(0),
2361  task_communication_xadj(task_comm_xadj),
2362  task_communication_adj(task_comm_adj){
2363 
2364  //if mapping type is 0 then it is coordinate mapping
2365  pcoord_t ** virtual_machine_coordinates = machine_coords;
2366  bool *wrap_arounds = new bool [proc_dim];
2367  for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;
2368 
2369  if (machine_dimensions){
2370  virtual_machine_coordinates =
2371  this->shiftMachineCoordinates (
2372  proc_dim,
2373  machine_dimensions,
2374  wrap_arounds,
2375  num_processors,
2376  machine_coords);
2377  }
2378 
2379  this->nprocs = num_processors;
2380 
2381  int coordDim = task_dim;
2382  this->ntasks = num_tasks;
2383 
2384  //alloc memory for part centers.
2385  tcoord_t **partCenters = task_coords;
2386 
2387  //create coordinate communication model.
2388  this->proc_task_comm =
2389  new CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
2390  proc_dim,
2391  virtual_machine_coordinates,
2392  coordDim,
2393  partCenters,
2394  this->nprocs,
2395  this->ntasks, NULL, NULL
2396  );
2397  this->proc_task_comm->setPartArraySize(recursion_depth);
2398  this->proc_task_comm->setPartArray(part_no_array);
2399 
2400  int myRank = problemComm->getRank();
2401 
2402  this->doMapping(myRank);
2403 #ifdef gnuPlot
2404  this->writeMapping2(myRank);
2405 #endif
2406 
2407  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
2408  this->proc_task_comm->calculateCommunicationCost(
2409  task_to_proc.getRawPtr(),
2410  task_communication_xadj.getRawPtr(),
2411  task_communication_adj.getRawPtr(),
2412  task_communication_edge_weight_
2413  );
2414 
2415 
2416  this->getBestMapping();
2417 
2418 /*
2419  if (myRank == 0){
2420  this->proc_task_comm->calculateCommunicationCost(
2421  task_to_proc.getRawPtr(),
2422  task_communication_xadj.getRawPtr(),
2423  task_communication_adj.getRawPtr(),
2424  task_communication_edge_weight_
2425  );
2426  cout << "me: " << problemComm->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << endl;
2427  }
2428 */
2429 
2430  }
2431  delete [] wrap_arounds;
2432 
2433  if (machine_dimensions){
2434  for (int i = 0; i < proc_dim; ++i){
2435  delete [] virtual_machine_coordinates[i];
2436  }
2437  delete [] virtual_machine_coordinates;
2438  }
2439 #ifdef gnuPlot
2440  if(problemComm->getRank() == 0)
2441  this->writeMapping2(-1);
2442 #endif
2443  }
2444 
2445 
2446  /*
2447  double getCommunicationCostMetric(){
2448  return this->proc_task_comm->getCommCost();
2449  }
2450  */
2451 
2454  virtual size_t getLocalNumberOfParts() const{
2455  return 0;
2456  }
2457 
2466  pcoord_t **shiftMachineCoordinates(
2467  int machine_dim,
2468  const part_t *machine_dimensions,
2469  bool *machine_extent_wrap_around,
2470  part_t numProcs,
2471  pcoord_t **mCoords){
2472  pcoord_t **result_machine_coords = NULL;
2473  result_machine_coords = new pcoord_t*[machine_dim];
2474  for (int i = 0; i < machine_dim; ++i){
2475  result_machine_coords[i] = new pcoord_t [numProcs];
2476  }
2477 
2478  for (int i = 0; i < machine_dim; ++i){
2479  part_t numMachinesAlongDim = machine_dimensions[i];
2480  part_t *machineCounts= new part_t[numMachinesAlongDim];
2481  memset(machineCounts, 0, sizeof(part_t) *numMachinesAlongDim);
2482 
2483  int *filledCoordinates= new int[numMachinesAlongDim];
2484 
2485  pcoord_t *coords = mCoords[i];
2486  for(part_t j = 0; j < numProcs; ++j){
2487  part_t mc = (part_t) coords[j];
2488  ++machineCounts[mc];
2489  }
2490 
2491  part_t filledCoordinateCount = 0;
2492  for(part_t j = 0; j < numMachinesAlongDim; ++j){
2493  if (machineCounts[j] > 0){
2494  filledCoordinates[filledCoordinateCount++] = j;
2495  }
2496  }
2497 
2498  part_t firstProcCoord = filledCoordinates[0];
2499  part_t firstProcCount = machineCounts[firstProcCoord];
2500 
2501  part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
2502  part_t lastProcCount = machineCounts[lastProcCoord];
2503 
2504  part_t firstLastGap = numMachinesAlongDim - lastProcCoord + firstProcCoord;
2505  part_t firstLastGapProc = lastProcCount + firstProcCount;
2506 
2507  part_t leftSideProcCoord = firstProcCoord;
2508  part_t leftSideProcCount = firstProcCount;
2509  part_t biggestGap = 0;
2510  part_t biggestGapProc = numProcs;
2511 
2512  part_t shiftBorderCoordinate = -1;
2513  for(part_t j = 1; j < filledCoordinateCount; ++j){
2514  part_t rightSideProcCoord= filledCoordinates[j];
2515  part_t rightSideProcCount = machineCounts[rightSideProcCoord];
2516 
2517  part_t gap = rightSideProcCoord - leftSideProcCoord;
2518  part_t gapProc = rightSideProcCount + leftSideProcCount;
2519 
2520  /* Pick the largest gap in this dimension. Break ties by choosing the gap
2521  with fewer processes on either side. An easy extension would be to
2522  weight the gap by the number of processes. */
2523  if (gap > biggestGap || (gap == biggestGap && biggestGapProc > gapProc)){
2524  shiftBorderCoordinate = rightSideProcCoord;
2525  biggestGapProc = gapProc;
2526  biggestGap = gap;
2527  }
2528  leftSideProcCoord = rightSideProcCoord;
2529  leftSideProcCount = rightSideProcCount;
2530  }
2531 
2532 
2533  if (!(biggestGap > firstLastGap || (biggestGap == firstLastGap && biggestGapProc < firstLastGapProc))){
2534  shiftBorderCoordinate = -1;
2535  }
2536 
2537  for(part_t j = 0; j < numProcs; ++j){
2538 
2539  if (machine_extent_wrap_around[i] && coords[j] < shiftBorderCoordinate){
2540  result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
2541 
2542  }
2543  else {
2544  result_machine_coords[i][j] = coords[j];
2545  }
2546  //cout << "I:" << i << "j:" << j << " coord:" << coords[j] << " now:" << result_machine_coords[i][j] << endl;
2547  }
2548  delete [] machineCounts;
2549  delete [] filledCoordinates;
2550  }
2551 
2552  return result_machine_coords;
2553 
2554  }
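 // Worked example (illustrative): along a dimension of extent 8 with
 // wrap-around, suppose the ranks occupy coordinates {0, 1, 6, 7}. The widest
 // interior gap lies between 1 and 6, so shiftBorderCoordinate becomes 6 and
 // every coordinate below it is shifted up by the extent, giving {8, 9, 6, 7}.
 // The machine is now "cut" at its emptiest region rather than at the physical
 // 7->0 wrap-around link, so ranks that are close in the network stay close
 // numerically.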
2555 
2562  virtual void getProcsForPart(part_t taskId, part_t &numProcs, part_t *&procs) const{
2563  numProcs = 1;
2564  procs = this->task_to_proc.getRawPtr() + taskId;
2565  }
2569  inline part_t getAssignedProcForTask(part_t taskId){
2570  return this->task_to_proc[taskId];
2571  }
2578  virtual void getPartsForProc(int procId, part_t &numParts, part_t *&parts) const{
2579 
2580  part_t task_begin = this->proc_to_task_xadj[procId];
2581  part_t taskend = this->proc_to_task_xadj[procId+1];
2582  parts = this->proc_to_task_adj.getRawPtr() + task_begin;
2583  numParts = taskend - task_begin;
2584  }
2585 
2586  ArrayView<part_t> getAssignedTasksForProc(part_t procId){
2587  part_t task_begin = this->proc_to_task_xadj[procId];
2588  part_t taskend = this->proc_to_task_xadj[procId+1];
2589 
2590  /*
2591  cout << "part_t:" << procId << " taskCount:" << taskend - task_begin << endl;
2592  for(part_t i = task_begin; i < taskend; ++i){
2593  cout << "part_t:" << procId << " task:" << proc_to_task_adj[i] << endl;
2594  }
2595  */
2596  if (taskend - task_begin > 0){
2597  ArrayView <part_t> assignedParts(this->proc_to_task_adj.getRawPtr() + task_begin, taskend - task_begin);
2598  return assignedParts;
2599  }
2600  else {
2601  ArrayView <part_t> assignedParts;
2602  return assignedParts;
2603  }
2604  }
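 // Query sketch (hypothetical caller code): the accessors above expose the
 // same mapping at different granularities.
 //
 // part_t owner = mapper->getAssignedProcForTask(task); // task -> rank
 // part_t numParts; part_t *parts;
 // mapper->getPartsForProc(rank, numParts, parts); // rank -> tasks, raw view
 // ArrayView<part_t> tasks = mapper->getAssignedTasksForProc(rank); // ArrayView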
2605 
2606 };
2607 
2608 
2609 
2678 template <typename part_t, typename pcoord_t, typename tcoord_t>
2679  void coordinateTaskMapperInterface(
2680  RCP<const Teuchos::Comm<int> > problemComm,
2681  int proc_dim,
2682  int num_processors,
2683  pcoord_t **machine_coords,
2684  int task_dim,
2685  part_t num_tasks,
2686  tcoord_t **task_coords,
2687  part_t *task_comm_xadj,
2688  part_t *task_comm_adj,
2689  pcoord_t *task_communication_edge_weight_, /*float-like, same size with task_communication_adj_ weight of the corresponding edge.*/
2690  part_t *proc_to_task_xadj, /*output*/
2691  part_t *proc_to_task_adj, /*output*/
2692  int recursion_depth,
2693  part_t *part_no_array,
2694  const part_t *machine_dimensions
2695 )
2696 {
2697 
2698  const Environment *envConst_ = new Environment();
2699 
2700  // mfh 03 Mar 2015: It's OK to omit the Node template
2701  // parameter in Tpetra, if you're just going to use the
2702  // default Node.
2703  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;
2704 
2705  Teuchos::ArrayRCP<part_t> task_communication_xadj (task_comm_xadj, 0, num_tasks+1, false);
2706 
2707  Teuchos::ArrayRCP<part_t> task_communication_adj;
2708  if (task_comm_xadj){
2709  Teuchos::ArrayRCP<part_t> tmp_task_communication_adj (task_comm_adj, 0, task_comm_xadj[num_tasks], false);
2710  task_communication_adj = tmp_task_communication_adj;
2711  }
2712 
2713 
2716  envConst_,
2717  problemComm.getRawPtr(),
2718  proc_dim,
2719  num_processors,
2720  machine_coords,//machine_coords_,
2721 
2722  task_dim,
2723  num_tasks,
2724  task_coords,
2725 
2726  task_communication_xadj,
2727  task_communication_adj,
2728  task_communication_edge_weight_,
2729  recursion_depth,
2730  part_no_array,
2731  machine_dimensions
2732  );
2733 
2734 
2735  part_t* proc_to_task_xadj_;
2736  part_t* proc_to_task_adj_;
2737 
2738  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
2739 
2740  for (part_t i = 0; i <= num_processors; ++i){
2741  proc_to_task_xadj[i] = proc_to_task_xadj_[i];
2742  }
2743  for (part_t i = 0; i < num_tasks; ++i){
2744  proc_to_task_adj[i] = proc_to_task_adj_[i];
2745  }
2746  delete ctm;
2747  delete envConst_;
2748 
2749 }
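 // Usage sketch for the raw-pointer interface above (hypothetical values; the
 // caller owns every array, and the two outputs must be preallocated with
 // num_processors+1 and num_tasks entries respectively):
 //
 // std::vector<part_t> p2t_xadj(num_processors + 1), p2t_adj(num_tasks);
 // Zoltan2::coordinateTaskMapperInterface<part_t, pcoord_t, tcoord_t>(
 //   comm, proc_dim, num_processors, machine_coords,
 //   task_dim, num_tasks, task_coords,
 //   task_comm_xadj, task_comm_adj, /*edge weights:*/ NULL,
 //   p2t_xadj.data(), p2t_adj.data(),
 //   recursion_depth, /*part_no_array:*/ NULL, /*machine_dimensions:*/ NULL);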
2750 
2751 
2752 }// namespace Zoltan2
2753 
2754 #endif