Zoltan2_AlgMultiJagged.hpp
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
45 
46 /*! \file Zoltan2_AlgMultiJagged.hpp
47  \brief Contains the Multi-jagged algorithm.
48  */
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
58 
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
62 #include <new> // ::operator new[]
63 #include <algorithm> // std::sort
64 #include <Zoltan2_Util.hpp>
65 #include <vector>
66 
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
69 #else
70 #include <Teuchos_Hashtable.hpp>
71 #endif // C++11 is enabled
72 
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h" // for error codes
78 #endif
79 #endif
80 
81 #ifdef HAVE_ZOLTAN2_OMP
82 #include <omp.h>
83 #endif
84 
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
87 
88 //if the (last dimension reduce all count) x (the mpi world size) is
89 //estimated to be bigger than this number, then migration will be forced
90 //in earlier iterations.
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
92 //if the parts right before the last dimension are estimated to have fewer
93 //than MIN_WORK_LAST_DIM coords, migration will be forced in earlier iterations.
94 #define MIN_WORK_LAST_DIM 1000
95 
96 
97 
98 
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
100 //imbalance calculation. Wreal / Wexpected - 1
101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
102  (Wachieved) / ((totalW) * (expectedRatio)) - 1
103 #define imbalanceOf2(Wachieved, wExpected) \
104  (Wachieved) / (wExpected) - 1
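//Illustrative example (hypothetical values, not from the library): with
//Wachieved = 60, totalW = 100, and expectedRatio = 0.5, imbalanceOf gives
//60 / (100 * 0.5) - 1 = 0.2, i.e., the part is 20% heavier than its target.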
105 
106 
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
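//Note (illustrative, hypothetical values): the swap macro expands to three
//statements, so a full block is required when it is used under an if:
//  int x = 1, y = 2, t;
//  if (x < y) { ZOLTAN2_ALGMULTIJAGGED_SWAP(x, y, t) } //now x == 2, y == 1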
108 
109 
110 namespace Teuchos{
111 
112 /*! \brief Zoltan2_BoxBoundaries is a reduction operation
113  * to all reduce the box boundaries.
114 */
115 
116 template <typename Ordinal, typename T>
117 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
118 {
119 private:
120  Ordinal size;
121  T _EPSILON;
122 
123 public:
126  Zoltan2_BoxBoundaries ():size(0), _EPSILON (std::numeric_limits<T>::epsilon()){}
127 
134  Zoltan2_BoxBoundaries (Ordinal s_):
135  size(s_), _EPSILON (std::numeric_limits<T>::epsilon()){}
136 
139  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
140  {
141  for (Ordinal i=0; i < count; i++){
142  if (ZOLTAN2_ABS(inBuffer[i]) > _EPSILON){
143  inoutBuffer[i] = inBuffer[i];
144  }
145  }
146  }
147 };
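//Illustrative sketch (hypothetical values, not library code): the reduce op
//keeps any incoming entry whose magnitude exceeds _EPSILON, so
//  inBuffer = {0.0, 3.2, 0.0}, inoutBuffer = {1.1, 0.0, 0.0}
//produces inoutBuffer = {1.1, 3.2, 0.0}; each process contributes only the
//box boundaries it actually owns.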
148 } // namespace Teuchos
149 
150 namespace Zoltan2{
151 
152 /*! \brief Allocates memory for the given size.
153  *
154  */
155 template <typename T>
156 T *allocMemory(size_t size){
157  if (size > 0){
158  T * a = new T[size];
159  if (a == NULL) {
160  throw "cannot allocate memory";
161  }
162  return a;
163  }
164  else {
165  return NULL;
166  }
167 }
168 
169 /*! \brief Frees the given array.
170  *
171  */
172 template <typename T>
173 void freeArray(T *&array){
174  if(array != NULL){
175  delete [] array;
176  array = NULL;
177  }
178 }
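//Illustrative usage sketch (hypothetical values, not library code):
//  double *buf = allocMemory<double>(100); //throws on allocation failure
//  //... use buf ...
//  freeArray<double>(buf); //deletes and resets the pointer to NULL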
179 
180 
188 template <typename IT, typename CT, typename WT>
189 class uMultiSortItem
190 {
191 public:
192  IT index;
193  CT count;
194  //unsigned int val;
195  WT *val;
196  WT _EPSILON;
197 
198  uMultiSortItem(){
199  this->index = 0;
200  this->count = 0;
201  this->val = NULL;
202  this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
203  }
204 
205 
206  uMultiSortItem(IT index_ ,CT count_, WT *vals_){
207  this->index = index_;
208  this->count = count_;
209  this->val = vals_;
210  this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
211  }
212 
213  uMultiSortItem(const uMultiSortItem<IT,CT,WT>& other){
214  this->index = other.index;
215  this->count = other.count;
216  this->val = other.val;
217  this->_EPSILON = other._EPSILON;
218  }
219 
220  ~uMultiSortItem(){
221  //freeArray<WT>(this->val);
222  }
223 
224  void set(IT index_ ,CT count_, WT *vals_){
225  this->index = index_;
226  this->count = count_;
227  this->val = vals_;
228  }
229 
230 
231  uMultiSortItem<IT,CT,WT>& operator=(const uMultiSortItem<IT,CT,WT>& other){
232  this->index = other.index;
233  this->count = other.count;
234  this->val = other.val;
235  return *(this);
236  }
237 
238  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
239  assert (this->count == other.count);
240  for(CT i = 0; i < this->count; ++i){
241  //if the values are equal go to next one.
242  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
243  continue;
244  }
245  //if next value is smaller return true;
246  if(this->val[i] < other.val[i]){
247  return true;
248  }
249  //if next value is bigger return false;
250  else {
251  return false;
252  }
253  }
254  //if they are totally equal.
255  return this->index < other.index;
256  }
257  bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
258  assert (this->count == other.count);
259  for(CT i = 0; i < this->count; ++i){
260  //if the values are equal go to next one.
261  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
262  continue;
263  }
264  //if next value is bigger return true;
265  if(this->val[i] > other.val[i]){
266  return true;
267  }
268  //if next value is smaller return false;
269  else //(this->val[i] < other.val[i])
270  {
271  return false;
272  }
273  }
274  //if they are totally equal.
275  return this->index > other.index;
276  }
277 };// uMultiSortItem
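//Illustrative sketch (hypothetical values, not library code): uMultiSortItem
//orders items lexicographically over the val array (within _EPSILON),
//breaking ties by index, so std::sort applies directly:
//  double xy0[2] = {1.0, 2.0};
//  double xy1[2] = {1.0, 1.5};
//  uMultiSortItem<int, int, double> items[2];
//  items[0].set(0, 2, xy0);
//  items[1].set(1, 2, xy1);
//  std::sort(items, items + 2); //items[0] now refers to xy1 (y = 1.5)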
278 
279 /*! \brief Sort items for the quick sort function.
280  *
281  */
282 template <class IT, class WT>
283 struct uSortItem
284 {
285  IT id;
286  //unsigned int val;
287  WT val;
288 };// uSortItem;
289 
290 /*! \brief Quick sort function.
291  * Sorts the arr of uSortItems, with respect to increasing vals.
292  */
293 template <class IT, class WT>
294 void uqsort(IT n, uSortItem<IT, WT> * arr)
295 {
296 
297  int NSTACK = 50;
298  int M = 7;
299  IT i, ir=n, j, k, l=1;
300  IT jstack=0, istack[50];
301  WT aval;
302  uSortItem<IT,WT> a, temp;
303 
304  --arr;
305  for (;;)
306  {
307  if (ir-l < M)
308  {
309  for (j=l+1;j<=ir;j++)
310  {
311  a=arr[j];
312  aval = a.val;
313  for (i=j-1;i>=1;i--)
314  {
315  if (arr[i].val <= aval)
316  break;
317  arr[i+1] = arr[i];
318  }
319  arr[i+1]=a;
320  }
321  if (jstack == 0)
322  break;
323  ir=istack[jstack--];
324  l=istack[jstack--];
325  }
326  else
327  {
328  k=(l+ir) >> 1;
329 
330  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
331  if (arr[l+1].val > arr[ir].val)
332  {
333  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
334  }
335  if (arr[l].val > arr[ir].val)
336  {
337  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
338  }
339  if (arr[l+1].val > arr[l].val)
340  {
341  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
342  }
343  i=l+1;
344  j=ir;
345  a=arr[l];
346  aval = a.val;
347  for (;;)
348  {
349  do i++; while (arr[i].val < aval);
350  do j--; while (arr[j].val > aval);
351  if (j < i) break;
352  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
353  }
354  arr[l]=arr[j];
355  arr[j]=a;
356  jstack += 2;
357  if (jstack > NSTACK){
358  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
359  exit(1);
360  }
361  if (ir-i+1 >= j-l)
362  {
363  istack[jstack]=ir;
364  istack[jstack-1]=i;
365  ir=j-1;
366  }
367  else
368  {
369  istack[jstack]=j-1;
370  istack[jstack-1]=l;
371  l=i;
372  }
373  }
374  }
375 }
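//Illustrative usage sketch (hypothetical values, not library code):
//  uSortItem<int, double> arr[3];
//  arr[0].id = 7; arr[0].val = 2.5;
//  arr[1].id = 3; arr[1].val = 0.5;
//  arr[2].id = 9; arr[2].val = 1.0;
//  uqsort<int, double>(3, arr); //ids now ordered 3, 9, 7 (increasing val)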
376 
377 template <class IT, class WT, class SIGN>
378 struct uSignedSortItem
379 {
380  IT id;
381  //unsigned int val;
382  WT val;
383  SIGN signbit; // 1 means positive, 0 means negative.
384  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
385  /*if I am negative, the other is positive*/
386  if (this->signbit < rhs.signbit){
387  return true;
388  }
389  /*if both have the same sign*/
390  else if (this->signbit == rhs.signbit){
391 
392  if (this->val < rhs.val){//if my value is smaller,
393  return this->signbit;//then if we both are positive return true.
394  //if we both are negative, return false.
395  }
396  else if (this->val > rhs.val){//if my value is larger,
397  return !this->signbit; //then if we both are positive return false.
398  //if we both are negative, return true.
399  }
400  else { //if both are equal.
401  return false;
402  }
403  }
404  else {
405  /*if I am positive, the other is negative*/
406  return false;
407  }
408 
409  }
410  bool operator>(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
411  /*if I am positive, the other is negative*/
412  if (this->signbit > rhs.signbit){
413  return true;
414  }
415  /*if both have the same sign*/
416  else if (this->signbit == rhs.signbit){
417 
418  if (this->val < rhs.val){//if my value is smaller,
419  return !this->signbit;//then if we both are positive return false.
420  //if we both are negative, return true.
421  }
422  else if (this->val > rhs.val){//if my value is larger,
423  return this->signbit; //then if we both are positive return true.
424  //if we both are negative, return false.
425  }
426  else { // if they are equal
427  return false;
428  }
429  }
430  else {
431  /*if I am negative, the other is positive*/
432  return false;
433  }
434  }
435  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
436  return !(*this > rhs);}
437  bool operator>=(const uSignedSortItem<IT, WT, SIGN>& rhs){
438  return !(*this < rhs);}
439 };
440 
441 /*! \brief Quick sort function.
442  * Sorts the arr of uSignedSortItems, with respect to increasing vals.
443  */
444 template <class IT, class WT, class SIGN>
445 void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr){
446 
447  IT NSTACK = 50;
448  IT M = 7;
449  IT i, ir=n, j, k, l=1;
450  IT jstack=0, istack[50];
451  uSignedSortItem<IT, WT, SIGN> a, temp;
452 
453  --arr;
454  for (;;)
455  {
456  if (ir < M + l)
457  {
458  for (j=l+1;j<=ir;j++)
459  {
460  a=arr[j];
461  for (i=j-1;i>=1;i--)
462  {
463  if (arr[i] <= a)
464  {
465  break;
466  }
467  arr[i+1] = arr[i];
468  }
469  arr[i+1]=a;
470  }
471  if (jstack == 0)
472  break;
473  ir=istack[jstack--];
474  l=istack[jstack--];
475  }
476  else
477  {
478  k=(l+ir) >> 1;
479  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
480  if (arr[l+1] > arr[ir])
481  {
482  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
483  }
484  if (arr[l] > arr[ir])
485  {
486  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
487  }
488  if (arr[l+1] > arr[l])
489  {
490  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
491  }
492  i=l+1;
493  j=ir;
494  a=arr[l];
495  for (;;)
496  {
497  do i++; while (arr[i] < a);
498  do j--; while (arr[j] > a);
499  if (j < i) break;
500  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
501  }
502  arr[l]=arr[j];
503  arr[j]=a;
504  jstack += 2;
505  if (jstack > NSTACK){
506  std::cout << "uqSignsort: NSTACK too small in sort." << std::endl;
507  exit(1);
508  }
509  if (ir+l+1 >= j+i)
510  {
511  istack[jstack]=ir;
512  istack[jstack-1]=i;
513  ir=j-1;
514  }
515  else
516  {
517  istack[jstack]=j-1;
518  istack[jstack-1]=l;
519  l=i;
520  }
521  }
522  }
523 }
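//Illustrative sketch (hypothetical values, not library code): items with
//signbit == 0 (negative) order before items with signbit == 1 (positive),
//and vals are compared within each sign group:
//  uSignedSortItem<int, double, char> arr[3];
//  arr[0].id = 0; arr[0].val = 2.0; arr[0].signbit = 1; // +2.0
//  arr[1].id = 1; arr[1].val = 5.0; arr[1].signbit = 0; // -5.0
//  arr[2].id = 2; arr[2].val = 1.0; arr[2].signbit = 1; // +1.0
//  uqSignsort<int, double, char>(3, arr); //id order: 1, 2, 0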
524 
525 /*! \brief Multi Jagged coordinate partitioning algorithm.
526  *
527  */
528 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
529  typename mj_part_t>
530 class AlgMJ
531 {
532 private:
533  typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t;
534  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
535 
536  RCP<const Environment> mj_env; //the environment object
537  RCP<const Comm<int> > mj_problemComm; //initial comm object
538 
539  double imbalance_tolerance; //input imbalance tolerance.
540  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
541  int recursion_depth; //the number of steps that partitioning will be solved in.
542  int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord
543 
544  size_t initial_num_loc_coords; //initial num local coords.
545  global_size_t initial_num_glob_coords; //initial num global coords.
546 
547  mj_lno_t num_local_coords; //number of local coords.
548  mj_gno_t num_global_coords; //number of global coords.
549 
550  mj_scalar_t **mj_coordinates; //two dimension coordinate array
551  mj_scalar_t **mj_weights; //two dimension weight array
552  bool *mj_uniform_parts; //if the target parts are uniform
553  mj_scalar_t **mj_part_sizes; //target part weight sizes.
554  bool *mj_uniform_weights; //if the coordinates have uniform weights.
555 
556  ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
557  size_t num_global_parts; //the targeted number of parts
558 
559  mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
560  mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
561  int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.
562 
563  mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
564  mj_lno_t *new_coordinate_permutations; //permutation work array.
565  mj_part_t *assigned_part_ids; //the part ids assigned to coordinates.
566 
567  mj_lno_t *part_xadj; //beginning and end of each part.
568  mj_lno_t *new_part_xadj; // work array for beginning and end of each part.
569 
570  //get mj specific parameters.
571  bool distribute_points_on_cut_lines; //if partitioning can distribute points on the same coordinate to different parts.
572  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
573 
574  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
575  int mj_user_recursion_depth; //the recursion depth value provided by user.
576  bool mj_keep_part_boxes; //if the boxes need to be kept.
577 
578  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
579  mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
580  int num_threads; //num threads
581 
582  mj_part_t total_num_cut ; //total number of cuts.
583  mj_part_t total_num_part; //total number of parts.
584 
585  mj_part_t max_num_part_along_dim ; //maximum part count along a dimension.
586  mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension.
587  size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.
588 
589  mj_part_t total_dim_num_reduce_all; //estimate of how many reduceAlls will be done.
590  mj_part_t last_dim_num_part; //max no of parts that might occur
591  //during the partition before the
592  //last partitioning dimension.
593 
594  RCP<Comm<int> > comm; //comm object that can be altered during execution
595  float fEpsilon; //epsilon for float
596  mj_scalar_t sEpsilon; //epsilon for mj_scalar_t
597 
598  mj_scalar_t maxScalar_t; //max possible scalar
599  mj_scalar_t minScalar_t; //min scalar
600 
601  mj_scalar_t *all_cut_coordinates;
602  mj_scalar_t *max_min_coords;
603  mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight an MPI process should put on the left side of each cut line
604  mj_scalar_t **thread_cut_line_weight_to_put_left; //how much weight percentage each thread in an MPI process should put on the left side of each cut line
605 
606  // work array to manipulate coordinate of cutlines in different iterations.
607  //necessary because previous cut line information is used for determining
608  //the next cutline information. therefore, cannot update the cut work array
609  //until all cutlines are determined.
610  mj_scalar_t *cut_coordinates_work_array;
611 
612  //cumulative part weight array.
613  mj_scalar_t *target_part_weights;
614 
615  mj_scalar_t *cut_upper_bound_coordinates ; //upper bound coordinate of a cut line
616  mj_scalar_t *cut_lower_bound_coordinates ; //lower bound coordinate of a cut line
617  mj_scalar_t *cut_lower_bound_weights ; //lower bound weight of a cut line
618  mj_scalar_t *cut_upper_bound_weights ; //upper bound weight of a cut line
619 
620  mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
621  mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.
622 
623  //isDone is used to determine if a cutline is determined already.
624  //If a cut line is already determined, the next iterations will skip this cut line.
625  bool *is_cut_line_determined;
626  //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
627  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
628  mj_part_t *my_incomplete_cut_count;
629  //local part weights of each thread.
630  double **thread_part_weights;
632  //the work manipulation array for part weights.
632  double **thread_part_weight_work;
633 
634  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
635  mj_scalar_t **thread_cut_left_closest_point;
636  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
637  mj_scalar_t **thread_cut_right_closest_point;
638 
639  //to store how many points in each part a thread has.
640  mj_lno_t **thread_point_counts;
641 
642  mj_scalar_t *process_rectilinear_cut_weight;
643  mj_scalar_t *global_rectilinear_cut_weight;
644 
645  //for faster communication, concatenation of
646  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
647  //leftClosest distances sized P-1, since P-1 cut lines
648  //rightClosest distances size P-1, since P-1 cut lines.
649  mj_scalar_t *total_part_weight_left_right_closests ;
650  mj_scalar_t *global_total_part_weight_left_right_closests;
651 
652  RCP<mj_partBoxVector_t> kept_boxes; // vector of all boxes for all parts;
653  // constructed only if
654  // mj_keep_part_boxes == true
655  RCP<mj_partBox_t> global_box;
656  int myRank, myActualRank; //processor rank, and initial rank
657 
658  /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
659  * the input. part_no_array takes
660  * precedence if both are provided.
661  * Depending on these parameters, total cut/part number,
662  * maximum part/cut number along a dimension, estimated number of reduceAlls,
663  * and the number of parts before the last dimension are calculated.
664  * */
665  void set_part_specifications();
666 
667  /* \brief Tries to determine the part number for current dimension,
668  * by trying to make the partitioning as square as possible.
669  * \param num_total_future how many more partitionings are required.
670  * \param root inverse of the remaining recursion depth (the exponent used).
671  */
672  inline mj_part_t get_part_count(
673  mj_part_t num_total_future,
674  double root);
675 
676  /* \brief Allocates the all required memory for the mj partitioning algorithm.
677  *
678  */
679  void allocate_set_work_memory();
680 
681  /* \brief for part communication we keep track of the box boundaries.
682  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
683  * This function initializes a single box with all global min and max coordinates.
684  * \param initial_partitioning_boxes the input and output vector for boxes.
685  */
686  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
687 
688  /* \brief compute global bounding box: min/max coords of global domain */
689  void compute_global_box();
690 
691  /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
692  * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
693  * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
694  * If part boxes are kept, then it initializes the output_part_boxes from their ancestors.
695  *
696  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
697  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
698  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
699  * \param future_num_parts: output, max number of future parts that will be obtained from a single part.
700  * \param current_num_parts: input, how many parts are there currently.
701  * \param current_iteration: input, current dimension iteration number.
702  * \param input_part_boxes: input, if boxes are kept, current boxes.
703  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
704  */
705  mj_part_t update_part_num_arrays(
706  std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
707  std::vector<mj_part_t> *future_num_part_in_parts,
708  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
709  mj_part_t &future_num_parts,
710  mj_part_t current_num_parts,
711  int current_iteration,
712  RCP<mj_partBoxVector_t> input_part_boxes,
713  RCP<mj_partBoxVector_t> output_part_boxes);
714 
726  void mj_get_local_min_max_coord_totW(
727  mj_lno_t coordinate_begin_index,
728  mj_lno_t coordinate_end_index,
729  mj_lno_t *mj_current_coordinate_permutations,
730  mj_scalar_t *mj_current_dim_coords,
731  mj_scalar_t &min_coordinate,
732  mj_scalar_t &max_coordinate,
733  mj_scalar_t &total_weight);
734 
742  void mj_get_global_min_max_coord_totW(
743  mj_part_t current_concurrent_num_parts,
744  mj_scalar_t *local_min_max_total,
745  mj_scalar_t *global_min_max_total);
746 
765  void mj_get_initial_cut_coords_target_weights(
766  mj_scalar_t min_coord,
767  mj_scalar_t max_coord,
768  mj_part_t num_cuts/*p-1*/ ,
769  mj_scalar_t global_weight,
770  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
771  mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
772 
773  std::vector <mj_part_t> *future_num_part_in_parts, //the vector of how many future parts each part will be divided into
774  std::vector <mj_part_t> *next_future_num_parts_in_parts,
775  mj_part_t concurrent_current_part,
776  mj_part_t obtained_part_index);
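//Illustrative sketch (hypothetical values, not library code): for a part
//spanning [0, 10] with num_cuts = 3 and uniform target part weights, the
//initial cut coordinates are {2.5, 5.0, 7.5}, and target_part_weights holds
//the cumulative targets {0.25, 0.5, 0.75} x global_weight.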
777 
790  void set_initial_coordinate_parts(
791  mj_scalar_t &max_coordinate,
792  mj_scalar_t &min_coordinate,
793  mj_part_t &concurrent_current_part_index,
794  mj_lno_t coordinate_begin_index,
795  mj_lno_t coordinate_end_index,
796  mj_lno_t *mj_current_coordinate_permutations,
797  mj_scalar_t *mj_current_dim_coords,
798  mj_part_t *mj_part_ids,
799  mj_part_t &partition_count);
800 
811  void mj_1D_part(
812  mj_scalar_t *mj_current_dim_coords,
813  mj_scalar_t imbalanceTolerance,
814  mj_part_t current_work_part,
815  mj_part_t current_concurrent_num_parts,
816  mj_scalar_t *current_cut_coordinates,
817  mj_part_t total_incomplete_cut_count,
818  std::vector <mj_part_t> &num_partitioning_in_current_dim);
819 
839  void mj_1D_part_get_thread_part_weights(
840  size_t total_part_count,
841  mj_part_t num_cuts,
842  mj_scalar_t max_coord,
843  mj_scalar_t min_coord,
844  mj_lno_t coordinate_begin_index,
845  mj_lno_t coordinate_end_index,
846  mj_scalar_t *mj_current_dim_coords,
847  mj_scalar_t *temp_current_cut_coords,
848  bool *current_cut_status,
849  double *my_current_part_weights,
850  mj_scalar_t *my_current_left_closest,
851  mj_scalar_t *my_current_right_closest);
852 
860  void mj_accumulate_thread_results(
861  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
862  mj_part_t current_work_part,
863  mj_part_t current_concurrent_num_parts);
864 
895  void mj_get_new_cut_coordinates(
896  const size_t &num_total_part,
897  const mj_part_t &num_cuts,
898  const mj_scalar_t &max_coordinate,
899  const mj_scalar_t &min_coordinate,
900  const mj_scalar_t &global_total_weight,
901  const mj_scalar_t &used_imbalance_tolerance,
902  mj_scalar_t * current_global_part_weights,
903  const mj_scalar_t * current_local_part_weights,
904  const mj_scalar_t *current_part_target_weights,
905  bool *current_cut_line_determined,
906  mj_scalar_t *current_cut_coordinates,
907  mj_scalar_t *current_cut_upper_bounds,
908  mj_scalar_t *current_cut_lower_bounds,
909  mj_scalar_t *current_global_left_closest_points,
910  mj_scalar_t *current_global_right_closest_points,
911  mj_scalar_t * current_cut_lower_bound_weights,
912  mj_scalar_t * current_cut_upper_weights,
913  mj_scalar_t *new_current_cut_coordinates,
914  mj_scalar_t *current_part_cut_line_weight_to_put_left,
915  mj_part_t *rectilinear_cut_count,
916  mj_part_t &my_num_incomplete_cut);
917 
927  void mj_calculate_new_cut_position (
928  mj_scalar_t cut_upper_bound,
929  mj_scalar_t cut_lower_bound,
930  mj_scalar_t cut_upper_weight,
931  mj_scalar_t cut_lower_weight,
932  mj_scalar_t expected_weight,
933  mj_scalar_t &new_cut_position);
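//Illustrative sketch (an assumption about the approach, not library code):
//with weight growing monotonically between the bounds, a linear
//interpolation estimate of the next cut position would be
//  new_cut_position = cut_lower_bound +
//      (expected_weight - cut_lower_weight) *
//      (cut_upper_bound - cut_lower_bound) /
//      (cut_upper_weight - cut_lower_weight);
//e.g., bounds (0, 10) with weights (0, 100) and expected_weight = 25
//give a trial cut at 2.5.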
934 
945  void mj_create_new_partitions(
946  mj_part_t num_parts,
947  mj_scalar_t *mj_current_dim_coords,
948  mj_scalar_t *current_concurrent_cut_coordinate,
949  mj_lno_t coordinate_begin,
950  mj_lno_t coordinate_end,
951  mj_scalar_t *used_local_cut_line_weight_to_left,
952  double **used_thread_part_weight_work,
953  mj_lno_t *out_part_xadj);
954 
977  bool mj_perform_migration(
978  mj_part_t in_num_parts, //current number of parts
979  mj_part_t &out_num_parts, //output number of parts.
980  std::vector<mj_part_t> *next_future_num_parts_in_parts,
981  mj_part_t &output_part_begin_index,
982  size_t migration_reduce_all_population,
983  mj_lno_t num_coords_for_last_dim_part,
984  std::string iteration,
985  RCP<mj_partBoxVector_t> &input_part_boxes,
986  RCP<mj_partBoxVector_t> &output_part_boxes);
987 
997  void get_processor_num_points_in_parts(
998  mj_part_t num_procs,
999  mj_part_t num_parts,
1000  mj_gno_t *&num_points_in_all_processor_parts);
1001 
1014  bool mj_check_to_migrate(
1015  size_t migration_reduce_all_population,
1016  mj_lno_t num_coords_for_last_dim_part,
1017  mj_part_t num_procs,
1018  mj_part_t num_parts,
1019  mj_gno_t *num_points_in_all_processor_parts);
1020 
1021 
1039  void mj_migration_part_proc_assignment(
1040  mj_gno_t * num_points_in_all_processor_parts,
1041  mj_part_t num_parts,
1042  mj_part_t num_procs,
1043  mj_lno_t *send_count_to_each_proc,
1044  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1045  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1046  mj_part_t &out_num_part,
1047  std::vector<mj_part_t> &out_part_indices,
1048  mj_part_t &output_part_numbering_begin_index,
1049  int *coordinate_destinations);
1050 
1067  void mj_assign_proc_to_parts(
1068  mj_gno_t * num_points_in_all_processor_parts,
1069  mj_part_t num_parts,
1070  mj_part_t num_procs,
1071  mj_lno_t *send_count_to_each_proc,
1072  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1073  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1074  mj_part_t &out_part_index,
1075  mj_part_t &output_part_numbering_begin_index,
1076  int *coordinate_destinations);
1077 
1088  void assign_send_destinations(
1089  mj_part_t num_parts,
1090  mj_part_t *part_assignment_proc_begin_indices,
1091  mj_part_t *processor_chains_in_parts,
1092  mj_lno_t *send_count_to_each_proc,
1093  int *coordinate_destinations);
1094 
1107  void assign_send_destinations2(
1108  mj_part_t num_parts,
1109  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
1110  int *coordinate_destinations,
1111  mj_part_t &output_part_numbering_begin_index,
1112  std::vector<mj_part_t> *next_future_num_parts_in_parts);
1113 
1130  void mj_assign_parts_to_procs(
1131  mj_gno_t * num_points_in_all_processor_parts,
1132  mj_part_t num_parts,
1133  mj_part_t num_procs,
1134  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
1135  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
1136  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
1137  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
1138  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
1139  int *coordinate_destinations);
1140 
1153  void mj_migrate_coords(
1154  mj_part_t num_procs,
1155  mj_lno_t &num_new_local_points,
1156  std::string iteration,
1157  int *coordinate_destinations,
1158  mj_part_t num_parts);
1159 
1166  void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1167 
1168 
1174  void fill_permutation_array(
1175  mj_part_t output_num_parts,
1176  mj_part_t num_parts);
1177 
1186  void set_final_parts(
1187  mj_part_t current_num_parts,
1188  mj_part_t output_part_begin_index,
1189  RCP<mj_partBoxVector_t> &output_part_boxes,
1190  bool is_data_ever_migrated);
1191  /*! \brief Function frees all allocated work memory.
1192  */
1193  void free_work_memory();
1207  void create_consistent_chunks(
1208  mj_part_t num_parts,
1209  mj_scalar_t *mj_current_dim_coords,
1210  mj_scalar_t *current_concurrent_cut_coordinate,
1211  mj_lno_t coordinate_begin,
1212  mj_lno_t coordinate_end,
1213  mj_scalar_t *used_local_cut_line_weight_to_left,
1214  mj_lno_t *out_part_xadj,
1215  int coordInd);
1216 public:
1217  AlgMJ();
1218 
1247  void multi_jagged_part(
1248  const RCP<const Environment> &env,
1249  RCP<const Comm<int> > &problemComm,
1250 
1251  double imbalance_tolerance,
1252  size_t num_global_parts,
1253  mj_part_t *part_no_array,
1254  int recursion_depth,
1255 
1256  int coord_dim,
1257  mj_lno_t num_local_coords,
1258  mj_gno_t num_global_coords,
1259  const mj_gno_t *initial_mj_gnos,
1260  mj_scalar_t **mj_coordinates,
1261 
1262  int num_weights_per_coord,
1263  bool *mj_uniform_weights,
1264  mj_scalar_t **mj_weights,
1265  bool *mj_uniform_parts,
1266  mj_scalar_t **mj_part_sizes,
1267 
1268  mj_part_t *&result_assigned_part_ids,
1269  mj_gno_t *&result_mj_gnos
1270 
1271  );
1279  void set_partitioning_parameters(
1280  bool distribute_points_on_cut_lines_,
1281  int max_concurrent_part_calculation_,
1282  int check_migrate_avoid_migration_option_,
1283  mj_scalar_t minimum_migration_imbalance_);
1287  void set_to_keep_part_boxes();
1288 
1291  RCP<mj_partBox_t> get_global_box() const;
1292 
1293  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1294 
1295  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1296  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1297 
1322  void sequential_task_partitioning(
1323  const RCP<const Environment> &env,
1324  mj_lno_t num_total_coords,
1325  mj_lno_t num_selected_coords,
1326  size_t num_target_part,
1327  int coord_dim,
1328  mj_scalar_t **mj_coordinates,
1329  mj_lno_t *initial_selected_coords_output_permutation,
1330  mj_lno_t *output_xadj,
1331  int recursion_depth,
1332  const mj_part_t *part_no_array,
1333  bool partition_along_longest_dim);
1334 
1335 };
1336 
1361 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1362  typename mj_part_t>
1363 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::sequential_task_partitioning(
1364  const RCP<const Environment> &env,
1365  mj_lno_t num_total_coords,
1366  mj_lno_t num_selected_coords,
1367  size_t num_target_part,
1368  int coord_dim_,
1369  mj_scalar_t **mj_coordinates_,
1370  mj_lno_t *inital_adjList_output_adjlist,
1371  mj_lno_t *output_xadj,
1372  int rd,
1373  const mj_part_t *part_no_array_,
1374  bool partition_along_longest_dim
1375 ){
1376 
1377  this->mj_env = env;
1378  const RCP<Comm<int> > commN;
1379  this->mj_problemComm =
1380  Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1381  this->comm =
1382  Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1383  this->myActualRank = this->myRank = 1;
1384 
1385 #ifdef HAVE_ZOLTAN2_OMP
1386  int actual_num_threads = omp_get_num_threads();
1387  omp_set_num_threads(1);
1388 #endif
1389 
1390  //weights are uniform for task mapping
1391 
1392  //parts are uniform for task mapping
1393  //as input indices.
1394 
1395  this->imbalance_tolerance = 0;
1396  this->num_global_parts = num_target_part;
1397  this->part_no_array = (mj_part_t *)part_no_array_;
1398  this->recursion_depth = rd;
1399 
1400  this->coord_dim = coord_dim_;
1401  this->num_local_coords = num_total_coords;
1402  this->num_global_coords = num_total_coords;
1403  this->mj_coordinates = mj_coordinates_; //no copy is made; points to the given coordinate arrays.
1404 
1405  //allocate the gno work arrays; both are freed at the end of this function.
1406  this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1407  this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1408 
1409  this->num_weights_per_coord = 0;
1410  bool *tmp_mj_uniform_weights = new bool[1];
1411  this->mj_uniform_weights = tmp_mj_uniform_weights ;
1412  this->mj_uniform_weights[0] = true;
1413 
1414  mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
1415  this->mj_weights = tmp_mj_weights; //dummy weight array; weights are uniform for task mapping.
1416 
1417  bool *tmp_mj_uniform_parts = new bool[1];
1418  this->mj_uniform_parts = tmp_mj_uniform_parts;
1419  this->mj_uniform_parts[0] = true;
1420 
1421  mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
1422  this->mj_part_sizes = tmp_mj_part_sizes;
1423  this->mj_part_sizes[0] = NULL;
1424 
1425  this->num_threads = 1;
1426  this->set_part_specifications();
1427 
1428  this->allocate_set_work_memory();
1429  //the end of the initial partition is the end of coordinates.
1430  this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
1431  for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1432  this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1433  }
1434 
1435  mj_part_t current_num_parts = 1;
1436 
1437  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1438 
1439  mj_part_t future_num_parts = this->total_num_part;
1440 
1441  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
1442  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
1443  next_future_num_parts_in_parts->push_back(this->num_global_parts);
1444  RCP<mj_partBoxVector_t> t1;
1445  RCP<mj_partBoxVector_t> t2;
1446 
1447 
1448  std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1449  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted = &(coord_dimension_range_sorted[0]);
1450  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1451  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1452 
1453  for (int i = 0; i < this->recursion_depth; ++i){
1454 
1455  //partitioning array. Its size will be the current number of parts, and it
1456  //holds how many parts each current part will be divided into in this dimension.
1457  std::vector <mj_part_t> num_partitioning_in_current_dim;
1458 
1459  //number of parts that will be obtained at the end of this partitioning.
1460  //future_num_part_in_parts has the size of the current number of parts, and
1461  //holds how many more parts each part should be divided into in the further
1462  //iterations. It is used to calculate num_partitioning_in_current_dim,
1463  //the number of parts each part will be partitioned into
1464  //in the current dimension partitioning.
1465 
1466  //next_future_num_parts_in_parts will have the size of the output part count,
1467  //and holds how many more parts each output part
1468  //should be divided into. This array is also used to determine the weight
1469  //ratios of the parts.
1470  //swap the arrays to use them iteratively.
1471  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1472  future_num_part_in_parts = next_future_num_parts_in_parts;
1473  next_future_num_parts_in_parts = tmpPartVect;
1474 
1475  //clear the next_future_num_parts_in_parts array, as
1476  //update_part_num_arrays expects it to be empty.
1477  //it also expects num_partitioning_in_current_dim to be empty.
1478  next_future_num_parts_in_parts->clear();
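//Illustrative trace (hypothetical values, not from the code): with
//num_global_parts = 8 and recursion_depth = 3, iteration 0 starts from
//future_num_part_in_parts = {8}; splitting the single part in two leaves
//next_future_num_parts_in_parts = {4, 4}, iteration 1 leaves {2, 2, 2, 2},
//and iteration 2 leaves {1, 1, 1, 1, 1, 1, 1, 1}.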
1479 
1480 
1481  //returns the total number of output parts for this dimension partitioning.
1482  mj_part_t output_part_count_in_dimension =
1483  this->update_part_num_arrays(
1484  num_partitioning_in_current_dim,
1485  future_num_part_in_parts,
1486  next_future_num_parts_in_parts,
1487  future_num_parts,
1488  current_num_parts,
1489  i,
1490  t1,
1491  t2);
1492 
1493  //if the number of obtained parts equals the current number of parts,
1494  //skip this dimension. For example, this happens when 1 is given in the
1495  //input part array, e.g., P=4,5,1,2.
1496  if(output_part_count_in_dimension == current_num_parts) {
1497  tmpPartVect= future_num_part_in_parts;
1498  future_num_part_in_parts = next_future_num_parts_in_parts;
1499  next_future_num_parts_in_parts = tmpPartVect;
1500  continue;
1501  }
1502 
1503  //convert i to string to be used for debugging purposes.
1504  std::string istring = Teuchos::toString<int>(i);
1505 
1506  //allocate memory for the indices
1507  //of the parts in the permutation array.
1508  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1509 
1510  //the index into the output part counts (new_part_xadj) where results will be written.
1511  mj_part_t output_part_index = 0;
1512  //whatever is written to the output will be added to output_coordinate_end_index
1513  //so that the points will be shifted.
1514  mj_part_t output_coordinate_end_index = 0;
1515 
1516  mj_part_t current_work_part = 0;
1517  mj_part_t current_concurrent_num_parts = 1;
1518 
1519  mj_part_t obtained_part_index = 0;
1520 
1521  //get the coordinate axis along which the partitioning will be done.
1522  int coordInd = i % this->coord_dim;
1523  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1524 
1525 
1526  //run for all available parts.
1527  for (; current_work_part < current_num_parts;
1528  current_work_part += current_concurrent_num_parts){
1529 
1530 
1531  //current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1532  //this->max_concurrent_part_calculation);
1533 
1534  mj_part_t actual_work_part_count = 0;
1535  //initialization for 1D partitioning.
1536  //get the min and max coordinates of each part
1537  //together with the part weights of each part.
1538  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1539  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1540 
1541  //if this part won't be partitioned any further,
1542  //don't do any work for this part.
1543  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1544  continue;
1545  }
1546  ++actual_work_part_count;
1547  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1548  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1549 
1550  /*
1551  std::cout << "i:" << i << " j:" << current_work_part + kk
1552  << " coordinate_begin_index:" << coordinate_begin_index
1553  << " coordinate_end_index:" << coordinate_end_index
1554  << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
1555  */
1556 
1557 
1558  if(partition_along_longest_dim){
1559 
1560  mj_scalar_t best_weight_coord = 0;
1561  for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1562  mj_scalar_t best_min_coord = 0;
1563  mj_scalar_t best_max_coord = 0;
1564  //MD:same for all coordinates, but I will still use this for now.
1565 
1566  this->mj_get_local_min_max_coord_totW(
1567  coordinate_begin_index,
1568  coordinate_end_index,
1569  this->coordinate_permutations,
1570  this->mj_coordinates[coord_traverse_ind],
1571  best_min_coord, //min coordinate
1572  best_max_coord, //max coordinate
1573  best_weight_coord //total weight
1574  );
1575 
1576  coord_dim_mins[coord_traverse_ind] = best_min_coord;
1577  coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1578  mj_scalar_t best_range = best_max_coord - best_min_coord;
1579  coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1580  coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1581  coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1582  }
1583 
1584 
1585  uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1586  coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1587  /*
1588  for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1589  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " range:" << p_coord_dimension_range_sorted[coord_traverse_ind].val << std::endl;
1590  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " coord_dim_mins:" << coord_dim_mins[p_coord_dimension_range_sorted[coord_traverse_ind].id]<< std::endl;
1591  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " coord_dim_maxs:" << coord_dim_maxs[p_coord_dimension_range_sorted[coord_traverse_ind].id] << std::endl;
1592 
1593  }
1594  */
1595  mj_current_dim_coords = this->mj_coordinates[coordInd];
1596 
1597  this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1598  this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1599  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1600 
1601  }
1602  else{
1603  this->mj_get_local_min_max_coord_totW(
1604  coordinate_begin_index,
1605  coordinate_end_index,
1606  this->coordinate_permutations,
1607  mj_current_dim_coords,
1608  this->process_local_min_max_coord_total_weight[kk], //min coordinate
1609  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
1610  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight
1611  );
1612  }
1613  }
1614 
1615  //1D partitioning
1616  if (actual_work_part_count > 0){
1617  //obtain global Min max of the part.
1618  this->mj_get_global_min_max_coord_totW(
1619  current_concurrent_num_parts,
1620  this->process_local_min_max_coord_total_weight,
1621  this->global_min_max_coord_total_weight);
1622 
1623  //represents the total number of cutlines
1624  //whose coordinate should be determined.
1625  mj_part_t total_incomplete_cut_count = 0;
1626 
1627  //Compute weight ratios for parts & cuts:
1628  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
1629  //part0 cut0 part1 cut1 part2 cut2 part3
1630  mj_part_t concurrent_part_cut_shift = 0;
1631  mj_part_t concurrent_part_part_shift = 0;
1632  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1633  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1634  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1635  current_concurrent_num_parts];
1636  mj_scalar_t global_total_weight =
1637  this->global_min_max_coord_total_weight[kk +
1638  2 * current_concurrent_num_parts];
1639 
1640  mj_part_t concurrent_current_part_index = current_work_part + kk;
1641 
1642  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1643 
1644  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1645  mj_scalar_t *current_target_part_weights = this->target_part_weights +
1646  concurrent_part_part_shift;
1647  //shift the usedCutCoordinate array as noCuts.
1648  concurrent_part_cut_shift += partition_count - 1;
1649  //shift the partRatio array as noParts.
1650  concurrent_part_part_shift += partition_count;
1651 
1652  //calculate only if part is not empty,
1653  //and part will be further partitioned.
1654  if(partition_count > 1 && min_coordinate <= max_coordinate){
1655 
1656  //increase allDone by the number of cuts of the current
1657  //part's cut line number.
1658  total_incomplete_cut_count += partition_count - 1;
1659  //set the number of cut lines that should be determined
1660  //for this part.
1661  this->my_incomplete_cut_count[kk] = partition_count - 1;
1662 
1663  //get the target weights of the parts.
1664  this->mj_get_initial_cut_coords_target_weights(
1665  min_coordinate,
1666  max_coordinate,
1667  partition_count - 1,
1668  global_total_weight,
1669  usedCutCoordinate,
1670  current_target_part_weights,
1671  future_num_part_in_parts,
1672  next_future_num_parts_in_parts,
1673  concurrent_current_part_index,
1674  obtained_part_index);
1675 
1676  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1677  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1678 
1679  //get the initial estimated part assignments of the coordinates.
1680  this->set_initial_coordinate_parts(
1681  max_coordinate,
1682  min_coordinate,
1683  concurrent_current_part_index,
1684  coordinate_begin_index, coordinate_end_index,
1685  this->coordinate_permutations,
1686  mj_current_dim_coords,
1687  this->assigned_part_ids,
1688  partition_count);
1689  }
1690  else {
1691  // e.g., if we have fewer coordinates than parts, we don't need to do the next dim.
1692  this->my_incomplete_cut_count[kk] = 0;
1693  }
1694  obtained_part_index += partition_count;
1695  }
1696 
1697  //used imbalance, it is always 0, as it is difficult to estimate a range.
1698  mj_scalar_t used_imbalance = 0;
1699 
1700  // Determine cut lines for k parts here.
1701  this->mj_1D_part(
1702  mj_current_dim_coords,
1703  used_imbalance,
1704  current_work_part,
1705  current_concurrent_num_parts,
1706  current_cut_coordinates,
1707  total_incomplete_cut_count,
1708  num_partitioning_in_current_dim);
1709  }
1710 
1711  //create part chunks
1712  {
1713 
1714  mj_part_t output_array_shift = 0;
1715  mj_part_t cut_shift = 0;
1716  size_t tlr_shift = 0;
1717  size_t partweight_array_shift = 0;
1718 
1719  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1720  mj_part_t current_concurrent_work_part = current_work_part + kk;
1721  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1722 
1723  //if the part is empty, skip the part.
1724  if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1725  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1726 
1727  for(mj_part_t jj = 0; jj < num_parts; ++jj){
1728  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1729  }
1730  cut_shift += num_parts - 1;
1731  tlr_shift += (4 *(num_parts - 1) + 1);
1732  output_array_shift += num_parts;
1733  partweight_array_shift += (2 * (num_parts - 1) + 1);
1734  continue;
1735  }
1736 
1737  mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1738  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1739  -1];
1740  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1741  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1742  cut_shift;
1743 
1744  for(int ii = 0; ii < this->num_threads; ++ii){
1745  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1746  }
1747 
1748  if(num_parts > 1){
1749  // Rewrite the indices based on the computed cuts.
1750  this->create_consistent_chunks(
1751  num_parts,
1752  mj_current_dim_coords,
1753  current_concurrent_cut_coordinate,
1754  coordinate_begin,
1755  coordinate_end,
1756  used_local_cut_line_weight_to_left,
1757  this->new_part_xadj + output_part_index + output_array_shift,
1758  coordInd );
1759  }
1760  else {
1761  //if this part is partitioned into 1 then just copy
1762  //the old values.
1763  mj_lno_t part_size = coordinate_end - coordinate_begin;
1764  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1765  memcpy(this->new_coordinate_permutations + coordinate_begin,
1766  this->coordinate_permutations + coordinate_begin,
1767  part_size * sizeof(mj_lno_t));
1768  }
1769 
1770 
1771 
1772  cut_shift += num_parts - 1;
1773  tlr_shift += (4 *(num_parts - 1) + 1);
1774  output_array_shift += num_parts;
1775  partweight_array_shift += (2 * (num_parts - 1) + 1);
1776  }
1777 
1778  //shift cut coordinates so that all cut coordinates are stored.
1779  //current_cut_coordinates += cutShift;
1780 
1781  //create_consistent_chunks partitioned the coordinates and
1782  //wrote the indices as if there were a single part.
1783  //now we need to shift the beginning indices.
1784  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1785  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1786  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1787  //shift it by previousCount
1788  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1789  if (ii % 2 == 1){
1790  mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1791  mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1792 
1793  for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1794  mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1795  mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1796  }
1797  }
1798  }
1799  //increase the previous count by current end.
1800  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1801  //increase the current out.
1802  output_part_index += num_parts ;
1803  }
1804  }
1805  }
1806  // end of this partitioning dimension
1807 
1808  //set the current num parts for next dim partitioning
1809  current_num_parts = output_part_count_in_dimension;
1810 
1811  //swap the coordinate permutations for the next dimension.
1812  mj_lno_t * tmp = this->coordinate_permutations;
1813  this->coordinate_permutations = this->new_coordinate_permutations;
1814  this->new_coordinate_permutations = tmp;
1815 
1816  freeArray<mj_lno_t>(this->part_xadj);
1817  this->part_xadj = this->new_part_xadj;
1818  this->new_part_xadj = NULL;
1819  }
1820 
1821  for(mj_lno_t i = 0; i < num_total_coords; ++i){
1822  inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1823  }
1824 
1825  // Return output_xadj in CSR format
1826  output_xadj[0] = 0;
1827  for(size_t i = 0; i < this->num_global_parts ; ++i){
1828  output_xadj[i+1] = this->part_xadj[i];
1829  }
1830 
1831  delete future_num_part_in_parts;
1832  delete next_future_num_parts_in_parts;
1833 
1834  //free the extra memory that we allocated.
1835  freeArray<mj_part_t>(this->assigned_part_ids);
1836  freeArray<mj_gno_t>(this->initial_mj_gnos);
1837  freeArray<mj_gno_t>(this->current_mj_gnos);
1838  freeArray<bool>(tmp_mj_uniform_weights);
1839  freeArray<bool>(tmp_mj_uniform_parts);
1840  freeArray<mj_scalar_t *>(tmp_mj_weights);
1841  freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1842 
1843  this->free_work_memory();
1844 
1845 #ifdef HAVE_ZOLTAN2_OMP
1846  omp_set_num_threads(actual_num_threads);
1847 #endif
1848 }
1849 
1850 /*! \brief Multi Jagged coordinate partitioning algorithm default constructor.
1851  *
1852  */
1853 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1854  typename mj_part_t>
1855 AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::AlgMJ():
1856  mj_env(), mj_problemComm(), imbalance_tolerance(0),
1857  part_no_array(NULL), recursion_depth(0), coord_dim(0),
1858  num_weights_per_coord(0), initial_num_loc_coords(0),
1859  initial_num_glob_coords(0),
1860  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1861  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1862  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1863  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1864  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1865  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1866  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1867  mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
1868  check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
1869  num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1870  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1871  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1872  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1873  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1874  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1875  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1876  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1877  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1878  thread_part_weights(NULL), thread_part_weight_work(NULL),
1879  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1880  thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1881  global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1882  global_total_part_weight_left_right_closests(NULL),
1883  kept_boxes(),global_box(),
1884  myRank(0), myActualRank(0)
1885 {
1886  this->fEpsilon = std::numeric_limits<float>::epsilon();
1887  this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;
1888 
1889  this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1890  this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1891 
1892 }
1893 
1894 
1895 /*! \brief Function returns the global bounding box.
1896  * Returns null if the boxes are not kept.
1897  */
1898 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1899  typename mj_part_t>
1900 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1901 AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_global_box() const
1902 {
1903  return this->global_box;
1904 }
1905 
1906 /*! \brief Function call, if the part boxes are intended to be kept.
1907  *
1908  */
1909 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1910  typename mj_part_t>
1911 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_to_keep_part_boxes(){
1912  this->mj_keep_part_boxes = true;
1913 }
1914 
1915 
1916 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
1917  * the input. part_no_array takes
1918  * precedence if both are provided.
1919  * Depending on these parameters, total cut/part number,
1920  * maximum part/cut number along a dimension, estimated number of reduceAlls,
1921  * and the number of parts before the last dimension is calculated.
1922  * */
1923 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1924  typename mj_part_t>
1925 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_part_specifications(){
1926 
1927  this->total_num_cut = 0; //how many cuts will be totally
1928  this->total_num_part = 1; //how many parts will be totally
1929  this->max_num_part_along_dim = 0; //maximum part count along a dimension.
1930  this->total_dim_num_reduce_all = 0; //estimate of how many reduceAlls will be done.
1931  this->last_dim_num_part = 1; //max no of parts that might occur
1932  //during the partition before the
1933  //last partitioning dimension.
1934  this->max_num_cut_along_dim = 0;
1935  this->max_num_total_part_along_dim = 0;
1936 
1937  if (this->part_no_array){
1938  //if user provided part array, traverse the array and set variables.
1939  for (int i = 0; i < this->recursion_depth; ++i){
1940  this->total_dim_num_reduce_all += this->total_num_part;
1941  this->total_num_part *= this->part_no_array[i];
1942  if(this->part_no_array[i] > this->max_num_part_along_dim) {
1943  this->max_num_part_along_dim = this->part_no_array[i];
1944  }
1945  }
1946  this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1947  this->num_global_parts = this->total_num_part;
1948  } else {
1949  mj_part_t future_num_parts = this->num_global_parts;
1950 
1951  //we need to calculate the part numbers now, to determine the maximum along the dimensions.
1952  for (int i = 0; i < this->recursion_depth; ++i){
1953 
1954  mj_part_t maxNoPartAlongI = this->get_part_count(
1955  future_num_parts, 1.0f / (this->recursion_depth - i));
1956 
1957  if (maxNoPartAlongI > this->max_num_part_along_dim){
1958  this->max_num_part_along_dim = maxNoPartAlongI;
1959  }
1960 
1961  mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
1962  if (future_num_parts % maxNoPartAlongI){
1963  ++nfutureNumParts;
1964  }
1965  future_num_parts = nfutureNumParts;
1966  }
1967  this->total_num_part = this->num_global_parts;
1968  //estimate reduceAll Count here.
1969  //we find the upperbound instead.
1970  mj_part_t p = 1;
1971  for (int i = 0; i < this->recursion_depth; ++i){
1972  this->total_dim_num_reduce_all += p;
1973  p *= this->max_num_part_along_dim;
1974  }
1975 
1976  this->last_dim_num_part = p / this->max_num_part_along_dim;
1977  }
1978 
1979  this->total_num_cut = this->total_num_part - 1;
1980  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
1981  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
1982  //maxPartNo is P, maxCutNo = P-1, maxTotalPartCount = 2P-1
1983 
1984  //refine the concurrent part count, if it is given larger than the maximum possible part count.
1985  if(this->max_concurrent_part_calculation > this->last_dim_num_part){
1986  if(this->mj_problemComm->getRank() == 0){
1987  std::cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
1988  ") is larger than the maximum that can be used." <<
1989  " Setting to:" << this->last_dim_num_part << "." << std::endl;
1990  }
1991  this->max_concurrent_part_calculation = this->last_dim_num_part;
1992  }
1993 
1994 }
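//
// Worked example (editorial sketch, not part of the original source): with
// recursion_depth = 3 and part_no_array = {2, 3, 4}, the loop above yields
//   total_num_part               = 2 * 3 * 4 = 24
//   total_dim_num_reduce_all     = 1 + 2 + 6 = 9
//   max_num_part_along_dim       = 4
//   last_dim_num_part            = 24 / 4 = 6
//   total_num_cut                = 24 - 1 = 23
//   max_num_total_part_along_dim = 4 + 3 = 7   // parts + cuts, i.e. 2P-1
//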
1995 /* \brief Tries to determine the part number for the current dimension,
1996  * by trying to make the partitioning as square as possible.
1997  * \param num_total_future how many total future parts the current part must be divided into.
1998  * \param root the exponent 1/(remaining recursion depth); the result is
1999  * num_total_future^root, rounded up unless within an epsilon of an integer. */
2000 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2001  typename mj_part_t>
2002 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2003  mj_part_t num_total_future,
2004  double root)
2005 {
2006  double fp = pow(num_total_future, root);
2007  mj_part_t ip = mj_part_t (fp);
2008  if (fp - ip < this->fEpsilon * 100){
2009  return ip;
2010  }
2011  else {
2012  return ip + 1;
2013  }
2014 }
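//
// Worked example (editorial sketch, not part of the original source):
//   get_part_count(8, 1.0/3)  -> pow(8, 1/3)  = 2.0,   returns 2
//   get_part_count(10, 1.0/2) -> pow(10, 1/2) ~ 3.162, returns 4 (rounded up)
//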
2015 
2016 /* \brief Function returns how many parts will be obtained after this dimension's partitioning.
2017  * It fills num_partitioning_in_current_dim with how many parts each current part will be partitioned into in this dimension,
2018  * and fills next_future_num_parts_in_parts with how many total future parts each obtained part will be partitioned into.
2019  * If part boxes are kept, it initializes each entry of output_part_boxes as a copy of its ancestor box.
2020  *
2021  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
2022  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
2023  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
2024  * \param future_num_parts: output, max number of future parts that will be obtained from a single part.
2025  * \param current_num_parts: input, how many parts there are currently.
2026  * \param current_iteration: input, current dimension iteration number.
2027  * \param input_part_boxes: input, if boxes are kept, current boxes.
2028  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
2029  */
2030 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2031  typename mj_part_t>
2032 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2033  std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
2034  std::vector<mj_part_t> *future_num_part_in_parts,
2035  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
2036  mj_part_t &future_num_parts,
2037  mj_part_t current_num_parts,
2038  int current_iteration,
2039  RCP<mj_partBoxVector_t> input_part_boxes,
2040  RCP<mj_partBoxVector_t> output_part_boxes
2041 ){
2042  //how many parts that will be obtained after this dimension.
2043  mj_part_t output_num_parts = 0;
2044  if(this->part_no_array){
2045  //when the partNo array is provided as input,
2046  //each current partition will be partitioned into the same number of parts.
2047  //we don't need to use the future_num_part_in_parts vector in this case.
2048 
2049  mj_part_t p = this->part_no_array[current_iteration];
2050  if (p < 1){
2051  std::cerr << "i:" << current_iteration << " p is given as:" << p << std::endl;
2052  exit(1);
2053  }
2054  if (p == 1){
2055  return current_num_parts;
2056  }
2057 
2058  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2059  num_partitioning_in_current_dim.push_back(p);
2060 
2061  }
2062  //cout << "me:" << this->myRank << " current_iteration" << current_iteration <<
2063  //" current_num_parts:" << current_num_parts << std::endl;
2064  //cout << "num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0] << std::endl;
2065  //set the new value of future_num_parts.
2066 
2067  /*
2068  cout << "\tfuture_num_parts:" << future_num_parts
2069  << " num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0]
2070  << future_num_parts/ num_partitioning_in_current_dim[0] << std::endl;
2071  */
2072 
2073  future_num_parts /= num_partitioning_in_current_dim[0];
2074  output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2075 
2076  if (this->mj_keep_part_boxes){
2077  for (mj_part_t k = 0; k < current_num_parts; ++k){
2078  //initialize the output boxes as copies of their ancestor.
2079  for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2080  output_part_boxes->push_back((*input_part_boxes)[k]);
2081  }
2082  }
2083  }
2084 
2085  //set how many more parts each part will be divided into.
2086  //this is obvious when the partNo array is provided as input.
2087  //however, fill this so that weights will be calculated according to this array.
2088  for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2089  next_future_num_parts_in_parts->push_back(future_num_parts);
2090  }
2091  }
2092  else {
2093  //if partNo array is not provided as input,
2094  //future_num_part_in_parts holds how many parts each part should be divided.
2095  //initially it holds a single number equal to the total number of global parts.
2096 
2097  //calculate the future_num_parts from beginning,
2098  //since each part might be divided into different number of parts.
2099  future_num_parts = 1;
2100 
2101  //cout << "i:" << i << std::endl;
2102 
2103  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2104  //get how many parts this part should eventually be divided into.
2105  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2106 
2107  //get the ideal number of parts that is close to the
2108  //(recursion_depth - i) root of the future_num_parts_of_part_ii.
2109  mj_part_t num_partitions_in_current_dim =
2110  this->get_part_count(
2111  future_num_parts_of_part_ii,
2112  1.0 / (this->recursion_depth - current_iteration)
2113  );
2114 
2115  if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2116  std::cerr << "ERROR: maxPartNo calculation is wrong." << std::endl;
2117  exit(1);
2118  }
2119  //add this number to num_partitioning_in_current_dim vector.
2120  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2121 
2122 
2123  //increase the output number of parts.
2124  output_num_parts += num_partitions_in_current_dim;
2125 
2126  //ideal number of future partitions for each part.
2127  mj_part_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
2128  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2129  mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2130 
2131  //if there is a remainder, distribute it among the first parts.
2132  if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
2133  //if not uniform, add 1 for the extra parts.
2134  ++num_future_parts_for_part_iii;
2135  }
2136  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);
2137 
2138  //if part boxes are stored, initialize the box of the parts as the ancestor.
2139  if (this->mj_keep_part_boxes){
2140  output_part_boxes->push_back((*input_part_boxes)[ii]);
2141  }
2142 
2143  //set num future_num_parts to maximum in this part.
2144  if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2145  }
2146  }
2147  }
2148  return output_num_parts;
2149 }
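//
// Worked example (editorial sketch, not part of the original source): with
// current_num_parts = 2, future_num_part_in_parts = {12, 4}, and two
// recursion levels remaining (root = 1/2):
//   part 0: get_part_count(12, 0.5) = 4, part 1: get_part_count(4, 0.5) = 2
//   num_partitioning_in_current_dim = {4, 2}, output_num_parts = 6
//   next_future_num_parts_in_parts  = {3, 3, 3, 3, 2, 2}, future_num_parts = 3
//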
2150 
2151 
2152 /* \brief Allocates and initializes the work memory that will be used by MJ.
2153  *
2154  * */
2155 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2156  typename mj_part_t>
2157 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2158 
2159  //points to process that initially owns the coordinate.
2160  this->owner_of_coordinate = NULL;
2161 
2162  //Throughout the partitioning execution,
2163  //instead of moving the coordinates, hold a permutation array for parts.
2164  //coordinate_permutations holds the current permutation.
2165  this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2166  //initial configuration, set each pointer-i to i.
2167 #ifdef HAVE_ZOLTAN2_OMP
2168 #pragma omp parallel for
2169 #endif
2170  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2171  this->coordinate_permutations[i] = i;
2172  }
2173 
2174  //new_coordinate_permutations holds the current permutation.
2175  this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2176 
2177  this->assigned_part_ids = NULL;
2178  if(this->num_local_coords > 0){
2179  this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2180  }
2181 
2182  //single partition starts at index-0, and ends at numLocalCoords
2183  //inTotalCounts array holds the end points in coordinate_permutations array
2184  //for each partition. Initially sized 1, and single element is set to numLocalCoords.
2185  this->part_xadj = allocMemory<mj_lno_t>(1);
2186  this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
2187  //the end points of the output parts; this is allocated later.
2188  this->new_part_xadj = NULL;
2189 
2190  // only store this much if cuts are needed to be stored.
2191  //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);
2192 
2193 
2194  this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2195 
2196  this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2197 
2198  this->process_cut_line_weight_to_put_left = NULL; //how much weight percentage this MPI process should put on the left side of each cut line
2199  this->thread_cut_line_weight_to_put_left = NULL; //how much weight percentage each thread in this MPI process should put on the left side of each cut line
2200  //distribute_points_on_cut_lines = false;
2201  if(this->distribute_points_on_cut_lines){
2202  this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2203  this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2204  for(int i = 0; i < this->num_threads; ++i){
2205  this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2206  }
2207  this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2208  this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2209  }
2210 
2211 
2212  // work array to manipulate coordinate of cutlines in different iterations.
2213  //necessary because previous cut line information is used for determining
2214  //the next cutline information. therefore, cannot update the cut work array
2215  //until all cutlines are determined.
2216  this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2217  this->max_concurrent_part_calculation);
2218 
2219 
2220  //cumulative part weight array.
2221  this->target_part_weights = allocMemory<mj_scalar_t>(
2222  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2223  // the cumulative target weights, from left to right.
2224 
2225  this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); //upper bound coordinate of a cut line
2226  this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound coordinate of a cut line
2227  this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound weight of a cut line
2228  this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //upper bound weight of a cut line
2229 
2230  this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
2231  this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.
2232 
2233  //is_cut_line_determined is used to determine if a cutline is determined already.
2234  //If a cut line is already determined, the next iterations will skip this cut line.
2235  this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2236  //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
2237  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
2238  this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2239  //local part weights of each thread.
2240  this->thread_part_weights = allocMemory<double *>(this->num_threads);
2241  //the work manipulation array for part weights.
2242  this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2243 
2244  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
2245  this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2246  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
2247  this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2248 
2249  //to store how many points in each part a thread has.
2250  this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2251 
2252  for(int i = 0; i < this->num_threads; ++i){
2253  //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount);
2254  this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2255  this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2256  this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2257  this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2258  }
2259  //for faster communication, concatenation of
2260  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2261  //leftClosest distances sized P-1, since P-1 cut lines
2262  //rightClosest distances size P-1, since P-1 cut lines.
2263  this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2264  this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2265 
2266 
2267  mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2268  for (int i=0; i < this->coord_dim; i++){
2269  coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2270 #ifdef HAVE_ZOLTAN2_OMP
2271 #pragma omp parallel for
2272 #endif
2273  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2274  coord[i][j] = this->mj_coordinates[i][j];
2275  }
2276  this->mj_coordinates = coord;
2277 
2278 
2279  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2280  mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);
2281 
2282  for (int i=0; i < criteria_dim; i++){
2283  weights[i] = NULL;
2284  }
2285  for (int i=0; i < this->num_weights_per_coord; i++){
2286  weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2287 #ifdef HAVE_ZOLTAN2_OMP
2288 #pragma omp parallel for
2289 #endif
2290  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2291  weights[i][j] = this->mj_weights[i][j];
2292 
2293  }
2294  this->mj_weights = weights;
2295  this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2296 #ifdef HAVE_ZOLTAN2_OMP
2297 #pragma omp parallel for
2298 #endif
2299  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2300  this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2301 
2302  this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2303 
2304 #ifdef HAVE_ZOLTAN2_OMP
2305 #pragma omp parallel for
2306 #endif
2307  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2308  this->owner_of_coordinate[j] = this->myActualRank;
2309 }
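//
// Illustrative sketch (editorial, not part of the original source): with 6
// local coordinates split into two parts, the permutation representation is
//   coordinate_permutations = {4, 0, 2, 5, 1, 3};  // coordinate indices
//   part_xadj               = {3, 6};              // end points of each part
// so part p owns coordinate_permutations[part_xadj[p-1] .. part_xadj[p]-1],
// with part 0 starting at index 0; the coordinates themselves never move.
//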
2310 
2311 /* \brief compute the global bounding box
2312  */
2313 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2314  typename mj_part_t>
2315 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2316 {
2317  //local min coords
2318  mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2319  //global min coords
2320  mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2321  //local max coords
2322  mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2323  //global max coords
2324  mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2325 
2326  for (int i = 0; i < this->coord_dim; ++i){
2327  mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2328  mj_scalar_t localMax = -localMin;
2329  if (localMax > 0) localMax = 0; //guard for unsigned scalar types, where -localMin would wrap to a positive value.
2330 
2331 
2332  for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2333  if (this->mj_coordinates[i][j] < localMin){
2334  localMin = this->mj_coordinates[i][j];
2335  }
2336  if (this->mj_coordinates[i][j] > localMax){
2337  localMax = this->mj_coordinates[i][j];
2338  }
2339  }
2340  //cout << " localMin:" << localMin << endl;
2341  //cout << " localMax:" << localMax << endl;
2342  mins[i] = localMin;
2343  maxs[i] = localMax;
2344 
2345  }
2346  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2347  this->coord_dim, mins, gmins
2348  );
2349 
2350 
2351  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2352  this->coord_dim, maxs, gmaxs
2353  );
2354 
2355 
2356 
2357  //create single box with all areas.
2358  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2359  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2360  freeArray<mj_scalar_t>(mins);
2361  freeArray<mj_scalar_t>(gmins);
2362  freeArray<mj_scalar_t>(maxs);
2363  freeArray<mj_scalar_t>(gmaxs);
2364 }
2365 
2366 /* \brief for part communication we keep track of the box boundaries.
2367  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
2368  * This function initializes a single box with all global min and max coordinates.
2369  * \param initial_partitioning_boxes the input and output vector for boxes.
2370  */
2371 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2372  typename mj_part_t>
2373 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2374  RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2375 )
2376 {
2377  mj_partBox_t tmp_box(*global_box);
2378  initial_partitioning_boxes->push_back(tmp_box);
2379 }
2380 
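2381 /* \brief Computes the local min and max coordinates, and the local total
2382  * weight, of the part whose coordinates lie in the range
2383  * [coordinate_begin_index, coordinate_end_index) of
2384  * mj_current_coordinate_permutations. Results are returned through
2385  * min_coordinate, max_coordinate and total_weight.
2386  */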
2391 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2392  typename mj_part_t>
2393 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2394  mj_lno_t coordinate_begin_index,
2395  mj_lno_t coordinate_end_index,
2396  mj_lno_t *mj_current_coordinate_permutations,
2397  mj_scalar_t *mj_current_dim_coords,
2398  mj_scalar_t &min_coordinate,
2399  mj_scalar_t &max_coordinate,
2400  mj_scalar_t &total_weight){
2401 
2402  //if the part is empty,
2403  //set the min and max coordinates reversed (min = +maxScalar_t, max = minScalar_t), so the part is recognized as empty.
2404  if(coordinate_begin_index >= coordinate_end_index)
2405  {
2406  min_coordinate = this->maxScalar_t;
2407  max_coordinate = this->minScalar_t;
2408  total_weight = 0;
2409  }
2410  else {
2411  mj_scalar_t my_total_weight = 0;
2412 #ifdef HAVE_ZOLTAN2_OMP
2413 #pragma omp parallel
2414 #endif
2415  {
2416  //if uniform weights are used, then weight is equal to count.
2417  if (this->mj_uniform_weights[0]) {
2418 #ifdef HAVE_ZOLTAN2_OMP
2419 #pragma omp single
2420 #endif
2421  {
2422  my_total_weight = coordinate_end_index - coordinate_begin_index;
2423  }
2424 
2425  }
2426  else {
2427  //if not uniform, then weights are reduced across threads.
2428 #ifdef HAVE_ZOLTAN2_OMP
2429 #pragma omp for reduction(+:my_total_weight)
2430 #endif
2431  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2432  int i = mj_current_coordinate_permutations[ii];
2433  my_total_weight += this->mj_weights[0][i];
2434  }
2435  }
2436 
2437  int my_thread_id = 0;
2438 #ifdef HAVE_ZOLTAN2_OMP
2439  my_thread_id = omp_get_thread_num();
2440 #endif
2441  mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2442  my_thread_min_coord=my_thread_max_coord
2443  =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2444 
2445 
2446 #ifdef HAVE_ZOLTAN2_OMP
2447 #pragma omp for
2448 #endif
2449  for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2450  int i = mj_current_coordinate_permutations[j];
2451  if(mj_current_dim_coords[i] > my_thread_max_coord)
2452  my_thread_max_coord = mj_current_dim_coords[i];
2453  if(mj_current_dim_coords[i] < my_thread_min_coord)
2454  my_thread_min_coord = mj_current_dim_coords[i];
2455  }
2456  this->max_min_coords[my_thread_id] = my_thread_min_coord;
2457  this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2458 
2459 #ifdef HAVE_ZOLTAN2_OMP
2460 //we need a barrier here, because max_min_array might not be filled by some of the threads.
2461 #pragma omp barrier
2462 #pragma omp single nowait
2463 #endif
2464  {
2465  min_coordinate = this->max_min_coords[0];
2466  for(int i = 1; i < this->num_threads; ++i){
2467  if(this->max_min_coords[i] < min_coordinate)
2468  min_coordinate = this->max_min_coords[i];
2469  }
2470  }
2471 
2472 #ifdef HAVE_ZOLTAN2_OMP
2473 #pragma omp single nowait
2474 #endif
2475  {
2476  max_coordinate = this->max_min_coords[this->num_threads];
2477  for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2478  if(this->max_min_coords[i] > max_coordinate)
2479  max_coordinate = this->max_min_coords[i];
2480  }
2481  }
2482  }
2483  total_weight = my_total_weight;
2484  }
2485 }
2486 
2487 
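2488 /* \brief Reduces the local min/max coordinates and total weights of the
2489  * concurrently processed parts over all processes.
2490  * \param current_concurrent_num_parts number of parts processed concurrently.
2491  * \param local_min_max_total local array holding mins, then maxes, then total weights.
2492  * \param global_min_max_total output array for the globally reduced values, same layout.
2493  */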
2495 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2496  typename mj_part_t>
2497 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2498  mj_part_t current_concurrent_num_parts,
2499  mj_scalar_t *local_min_max_total,
2500  mj_scalar_t *global_min_max_total){
2501 
2502  //reduce min for the first current_concurrent_num_parts elements,
2503  //reduce max for the next current_concurrent_num_parts elements,
2504  //reduce sum for the last current_concurrent_num_parts elements.
2505  if(this->comm->getSize() > 1){
2506  Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
2507  reductionOp(
2508  current_concurrent_num_parts,
2509  current_concurrent_num_parts,
2510  current_concurrent_num_parts);
2511  try{
2512  reduceAll<int, mj_scalar_t>(
2513  *(this->comm),
2514  reductionOp,
2515  3 * current_concurrent_num_parts,
2516  local_min_max_total,
2517  global_min_max_total);
2518  }
2519  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2520  }
2521  else {
2522  mj_part_t s = 3 * current_concurrent_num_parts;
2523  for (mj_part_t i = 0; i < s; ++i){
2524  global_min_max_total[i] = local_min_max_total[i];
2525  }
2526  }
2527 }
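//
// Layout sketch (editorial, not part of the original source): for
// current_concurrent_num_parts = 2 the combined arrays are laid out as
//   { min_0, min_1,   // REDUCE_MIN slots
//     max_0, max_1,   // REDUCE_MAX slots
//     w_0,   w_1 }    // REDUCE_SUM slots
// so a single reduceAll with the combined operator fills
// global_min_max_total in the same layout.
//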
2528 
2529 
2530 
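2531 /* \brief Initializes the cut coordinates and the target part weights:
2532  * cut coordinates are spread along [min_coord, max_coord] proportionally to
2533  * the number of future parts each side will host, and
2534  * current_target_part_weights receives the corresponding cumulative weights.
2535  */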
2549 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2550  typename mj_part_t>
2551 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2552  mj_scalar_t min_coord,
2553  mj_scalar_t max_coord,
2554  mj_part_t num_cuts/*p-1*/ ,
2555  mj_scalar_t global_weight,
2556  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
2557  mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
2558 
2559  std::vector <mj_part_t> *future_num_part_in_parts, //the vector holding how many future parts each current part will be divided into
2560  std::vector <mj_part_t> *next_future_num_parts_in_parts,
2561  mj_part_t concurrent_current_part,
2562  mj_part_t obtained_part_index
2563 ){
2564 
2565  mj_scalar_t coord_range = max_coord - min_coord;
2566  if(this->mj_uniform_parts[0]){
2567  {
2568  mj_part_t cumulative = 0;
2569  //how many total future parts the part will be partitioned into.
2570  mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2571 
2572 
2573  //how much each part should weigh in ideal case.
2574  mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2575  /*
2576  cout << "total_future_part_count_in_part:" << total_future_part_count_in_part << endl;
2577  cout << "global_weight:" << global_weight << endl;
2578  cout << "unit_part_weight" << unit_part_weight <<endl;
2579  */
2580  for(mj_part_t i = 0; i < num_cuts; ++i){
2581  cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2582 
2583  /*
2584  cout << "obtained_part_index:" << obtained_part_index <<
2585  " (*next_future_num_parts_in_parts)[i + obtained_part_index]:" << (*next_future_num_parts_in_parts)[i + obtained_part_index] <<
2586  " cumulative:" << cumulative << endl;
2587  */
2588  //set target part weight.
2589  current_target_part_weights[i] = cumulative * unit_part_weight;
2590  //cout <<"i:" << i << " current_target_part_weights:" << current_target_part_weights[i] << endl;
2591  //set initial cut coordinate.
2592  initial_cut_coords[i] = min_coord + (coord_range *
2593  cumulative) / total_future_part_count_in_part;
2594  }
2595  current_target_part_weights[num_cuts] = 1;
2596  }
2597 
2598  //round the target part weights.
2599  if (this->mj_uniform_weights[0]){
2600  for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2601  current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2602  }
2603  }
2604  }
2605  else {
2606  std::cerr << "MJ does not support non uniform part weights" << std::endl;
2607  exit(1);
2608  }
2609 }
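//
// Worked example (editorial sketch, not part of the original source): for
// min_coord = 0, max_coord = 12, num_cuts = 3 (4 parts), global_weight = 100,
// and every obtained part hosting one future part:
//   unit_part_weight            = 100 / 4 = 25
//   initial_cut_coords          = {3, 6, 9}
//   current_target_part_weights = {25, 50, 75, ...}  // cumulative, left of each cut
//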
2610 
2611 
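2612 /* \brief Makes an initial part assignment for the coordinates in the given
2613  * range, assuming the points are uniformly distributed along the dimension.
2614  * Each coordinate receives an even part id 2*p; odd ids are reserved for
2615  * points that later turn out to sit on a cut line.
2616  */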
2624 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2625  typename mj_part_t>
2626 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
2627  mj_scalar_t &max_coordinate,
2628  mj_scalar_t &min_coordinate,
2629  mj_part_t &concurrent_current_part_index,
2630  mj_lno_t coordinate_begin_index,
2631  mj_lno_t coordinate_end_index,
2632  mj_lno_t *mj_current_coordinate_permutations,
2633  mj_scalar_t *mj_current_dim_coords,
2634  mj_part_t *mj_part_ids,
2635  mj_part_t &partition_count
2636 ){
2637  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2638 
2639  //if there is a single point, or if all points share the same coordinate along this dimension,
2640  //set the initial part to 0 for all.
2641  if(ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2642 #ifdef HAVE_ZOLTAN2_OMP
2643 #pragma omp parallel for
2644 #endif
2645  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2646  mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2647  }
2648  }
2649  else{
2650 
2651  //otherwise estimate an initial part for each coordinate.
2652  //assuming uniform distribution of points.
2653  mj_scalar_t slice = coordinate_range / partition_count;
2654 
2655 #ifdef HAVE_ZOLTAN2_OMP
2656 #pragma omp parallel for
2657 #endif
2658  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2659 
2660  mj_lno_t iii = mj_current_coordinate_permutations[ii];
2661  mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2662  mj_part_ids[iii] = 2 * pp;
2663  }
2664  }
2665 }
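//
// Worked example (editorial sketch, not part of the original source): with
// min_coordinate = 0, max_coordinate = 10 and partition_count = 5,
// slice = 2, so a coordinate at 7.3 gets pp = 3 and the initial id 2*3 = 6.
// The even/odd id convention lets the later 1D partitioning mark points that
// fall exactly on cut c with the odd id 2*c + 1.
//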
2666 
2667 
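2668 /* \brief One-dimensional partitioning: iteratively refines the cut
2669  * coordinates along the current dimension for the concurrently handled
2670  * parts until every cut satisfies the imbalance tolerance.
2671  * \param mj_current_dim_coords coordinates along the current dimension.
2672  * \param used_imbalance_tolerance allowed imbalance for this dimension.
2673  * \param current_work_part index of the first concurrently handled part.
2674  * \param current_concurrent_num_parts number of parts handled concurrently.
2675  * \param current_cut_coordinates array receiving the final cut coordinates.
2676  * \param total_incomplete_cut_count number of cuts not yet determined.
2677  * \param num_partitioning_in_current_dim how many parts each part is split into. */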
2678 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2679  typename mj_part_t>
2680 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
2681  mj_scalar_t *mj_current_dim_coords,
2682  mj_scalar_t used_imbalance_tolerance,
2683  mj_part_t current_work_part,
2684  mj_part_t current_concurrent_num_parts,
2685  mj_scalar_t *current_cut_coordinates,
2686  mj_part_t total_incomplete_cut_count,
2687  std::vector <mj_part_t> &num_partitioning_in_current_dim
2688 ){
2689 
2690 
2691  mj_part_t rectilinear_cut_count = 0;
2692  mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2693 
2694  Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
2695  *reductionOp = NULL;
2696  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
2697  <mj_part_t, mj_scalar_t>(
2698  &num_partitioning_in_current_dim ,
2699  current_work_part ,
2700  current_concurrent_num_parts);
2701 
2702  size_t total_reduction_size = 0;
2703 #ifdef HAVE_ZOLTAN2_OMP
2704 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count)
2705 #endif
2706  {
2707  int me = 0;
2708 #ifdef HAVE_ZOLTAN2_OMP
2709  me = omp_get_thread_num();
2710 #endif
2711  double *my_thread_part_weights = this->thread_part_weights[me];
2712  mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2713  mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2714 
2715 #ifdef HAVE_ZOLTAN2_OMP
2716 #pragma omp single
2717 #endif
2718  {
2719  //initialize the lower and upper bounds of the cuts.
2720  mj_part_t next = 0;
2721  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2722 
2723  mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2724  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2725  total_reduction_size += (4 * num_cut_in_dim + 1);
2726 
2727  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2728  this->is_cut_line_determined[next] = false;
2729  this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
2730  this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate
2731 
2732  this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
2733  this->cut_lower_bound_weights[next] = 0;
2734 
2735  if(this->distribute_points_on_cut_lines){
2736  this->process_cut_line_weight_to_put_left[next] = 0;
2737  }
2738  ++next;
2739  }
2740  }
2741  }
2742 
2743  //no need to have a barrier here;
2744  //#pragma omp single has an implicit barrier.
2745 
2746  int iteration = 0;
2747  while (total_incomplete_cut_count != 0){
2748  iteration += 1;
2749  //cout << "\niteration:" << iteration << " ";
2750  mj_part_t concurrent_cut_shifts = 0;
2751  size_t total_part_shift = 0;
2752 
2753  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2754  mj_part_t num_parts = -1;
2755  num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2756 
2757  mj_part_t num_cuts = num_parts - 1;
2758  size_t total_part_count = num_parts + size_t (num_cuts) ;
2759  if (this->my_incomplete_cut_count[kk] > 0){
2760 
2761  //although is_cut_line_determined is shared, the pointers derived from it below are private and identical across threads.
2762  bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2763  double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2764  mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2765  mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2766 
2767  mj_part_t conccurent_current_part = current_work_part + kk;
2768  mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2769  mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2770  mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2771 
2772  mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2773  mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2774 
2775  // compute part weights using existing cuts
2776  this->mj_1D_part_get_thread_part_weights(
2777  total_part_count,
2778  num_cuts,
2779  max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
2780  min_coord,//globalMinMaxTotal[kk]//minScalar,
2781  coordinate_begin_index,
2782  coordinate_end_index,
2783  mj_current_dim_coords,
2784  temp_current_cut_coords,
2785  current_cut_status,
2786  my_current_part_weights,
2787  my_current_left_closest,
2788  my_current_right_closest);
2789 
2790  }
2791 
2792  concurrent_cut_shifts += num_cuts;
2793  total_part_shift += total_part_count;
2794  }
2795 
2796  //sum up the results of threads
2797  this->mj_accumulate_thread_results(
2798  num_partitioning_in_current_dim,
2799  current_work_part,
2800  current_concurrent_num_parts);
2801 
2802  //now sum up the results of mpi processors.
2803 #ifdef HAVE_ZOLTAN2_OMP
2804 #pragma omp single
2805 #endif
2806  {
2807  if(this->comm->getSize() > 1){
2808  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2809  total_reduction_size,
2810  this->total_part_weight_left_right_closests,
2811  this->global_total_part_weight_left_right_closests);
2812 
2813  }
2814  else {
2815  memcpy(
2816  this->global_total_part_weight_left_right_closests,
2817  this->total_part_weight_left_right_closests,
2818  total_reduction_size * sizeof(mj_scalar_t));
2819  }
2820  }
2821 
2822  //how much cut will be shifted for the next part in the concurrent part calculation.
2823  mj_part_t cut_shift = 0;
2824 
2825  //how much the concatenated tlr array will be shifted for the next part in the concurrent part calculation.
2826  size_t tlr_shift = 0;
2827  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2828  mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2829  mj_part_t num_cuts = num_parts - 1;
2830  size_t num_total_part = num_parts + size_t (num_cuts) ;
2831 
2832  //if all cuts of this part have already been determined,
2833  //there is nothing to do for this part;
2834  //just update the shift amounts and proceed.
2835  if (this->my_incomplete_cut_count[kk] == 0) {
2836  cut_shift += num_cuts;
2837  tlr_shift += (num_total_part + 2 * num_cuts);
2838  continue;
2839  }
2840 
2841  mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2842  mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2843  mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
2844  mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
2845  mj_scalar_t *current_global_part_weights = current_global_tlr;
2846  bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2847 
2848  mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2849  mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2850 
2851  mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2852  mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2853  mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2854  mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2855  mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2856  mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2857  mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2858 
2859  mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2860 
2861  // Now compute the new cut coordinates.
2862  this->mj_get_new_cut_coordinates(
2863  num_total_part,
2864  num_cuts,
2865  max_coordinate,
2866  min_coordinate,
2867  global_total_weight,
2868  used_imbalance_tolerance,
2869  current_global_part_weights,
2870  current_local_part_weights,
2871  current_part_target_weights,
2872  current_cut_line_determined,
2873  temp_cut_coords + cut_shift,
2874  current_cut_upper_bounds,
2875  current_cut_lower_bounds,
2876  current_global_left_closest_points,
2877  current_global_right_closest_points,
2878  current_cut_lower_bound_weights,
2879  current_cut_upper_weights,
2880  this->cut_coordinates_work_array +cut_shift, //new cut coordinates
2881  current_part_cut_line_weight_to_put_left,
2882  &rectilinear_cut_count,
2883  this->my_incomplete_cut_count[kk]);
2884 
2885  cut_shift += num_cuts;
2886  tlr_shift += (num_total_part + 2 * num_cuts);
2887  mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
2888 #ifdef HAVE_ZOLTAN2_OMP
2889 #pragma omp single
2890 #endif
2891  {
2892  total_incomplete_cut_count -= iteration_complete_cut_count;
2893  }
2894 
2895  }
2896  { //This unnecessary bracket works around a compiler bug in NVCC when compiling with OpenMP enabled
2897 #ifdef HAVE_ZOLTAN2_OMP
2898 #pragma omp barrier
2899 #pragma omp single
2900 #endif
2901  {
2902  //swap the cut coordinates for next iteration.
2903  mj_scalar_t *t = temp_cut_coords;
2904  temp_cut_coords = this->cut_coordinates_work_array;
2905  this->cut_coordinates_work_array = t;
2906  }
2907  }
2908  }
2909 
2910  // Needed only if keep_cuts; otherwise can simply swap array pointers
2911  // cutCoordinates and cutCoordinatesWork.
2912  // (at the first iteration, cutCoordinates == cutCoordinates_tmp).
2913  // computed cuts must be in cutCoordinates.
2914  if (current_cut_coordinates != temp_cut_coords){
2915 #ifdef HAVE_ZOLTAN2_OMP
2916 #pragma omp single
2917 #endif
2918  {
2919  mj_part_t next = 0;
2920  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2921  mj_part_t num_parts = -1;
2922  num_parts = num_partitioning_in_current_dim[current_work_part + i];
2923  mj_part_t num_cuts = num_parts - 1;
2924 
2925  for(mj_part_t ii = 0; ii < num_cuts; ++ii){
2926  current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
2927  }
2928  next += num_cuts;
2929  }
2930  }
2931 
2932 #ifdef HAVE_ZOLTAN2_OMP
2933 #pragma omp single
2934 #endif
2935  {
2936  this->cut_coordinates_work_array = temp_cut_coords;
2937  }
2938  }
2939  }
2940  delete reductionOp;
2941 }
2942 
2943 
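2944 /* \brief Computes this thread's part weights and the closest coordinates to
2945  * each cut from the left and from the right, for the coordinates in
2946  * [coordinate_begin_index, coordinate_end_index). The 2P-1 sized
2947  * my_current_part_weights array holds, after the prefix sum at the end,
2948  * the cumulative weight at each part (even index) and cut (odd index).
2949  */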
2963 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2964  typename mj_part_t>
2965 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
2966  size_t total_part_count,
2967  mj_part_t num_cuts,
2968  mj_scalar_t max_coord,
2969  mj_scalar_t min_coord,
2970  mj_lno_t coordinate_begin_index,
2971  mj_lno_t coordinate_end_index,
2972  mj_scalar_t *mj_current_dim_coords,
2973  mj_scalar_t *temp_current_cut_coords,
2974  bool *current_cut_status,
2975  double *my_current_part_weights,
2976  mj_scalar_t *my_current_left_closest,
2977  mj_scalar_t *my_current_right_closest){
2978 
2979  // initializations for part weights, left/right closest
2980  for (size_t i = 0; i < total_part_count; ++i){
2981  my_current_part_weights[i] = 0;
2982  }
2983 
2984  //initialize the left and right closest coordinates
2985  //to their max value.
2986  for(mj_part_t i = 0; i < num_cuts; ++i){
2987  my_current_left_closest[i] = min_coord - 1;
2988  my_current_right_closest[i] = max_coord + 1;
2989  }
2990  //mj_lno_t comparison_count = 0;
2991  mj_scalar_t minus_EPSILON = -this->sEpsilon;
2992 #ifdef HAVE_ZOLTAN2_OMP
2993  //no need for the barrier as all threads use their local memories.
2994  //don't change the static scheduling here, as it is assumed when the new
2995  //partitions are created later.
2996 #pragma omp for
2997 #endif
2998  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2999  int i = this->coordinate_permutations[ii];
3000 
3001  //the accesses to assigned_part_ids are thread safe
3002  //since each coordinate is assigned to only a single thread.
3003  mj_part_t j = this->assigned_part_ids[i] / 2;
3004 
3005  if(j >= num_cuts){
3006  j = num_cuts - 1;
3007  }
3008 
3009  mj_part_t lower_cut_index = 0;
3010  mj_part_t upper_cut_index = num_cuts - 1;
3011 
3012  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3013  bool is_inserted = false;
3014  bool is_on_left_of_cut = false;
3015  bool is_on_right_of_cut = false;
3016  mj_part_t last_compared_part = -1;
3017 
3018  mj_scalar_t coord = mj_current_dim_coords[i];
3019 
3020  while(upper_cut_index >= lower_cut_index)
3021  {
3022  //comparison_count++;
3023  last_compared_part = -1;
3024  is_on_left_of_cut = false;
3025  is_on_right_of_cut = false;
3026  mj_scalar_t cut = temp_current_cut_coords[j];
3027  mj_scalar_t distance_to_cut = coord - cut;
3028  mj_scalar_t abs_distance_to_cut = ZOLTAN2_ABS(distance_to_cut);
3029 
3030  //if it is on the line.
3031  if(abs_distance_to_cut < this->sEpsilon){
3032 
3033  my_current_part_weights[j * 2 + 1] += w;
3034  this->assigned_part_ids[i] = j * 2 + 1;
3035 
3036  //assign left and right closest point to cut as the point is on the cut.
3037  my_current_left_closest[j] = coord;
3038  my_current_right_closest[j] = coord;
3039  //now we need to check if there are other cuts at the same coordinate.
3040  //if there are, then we add the point's weight to all cuts at that coordinate.
3041  mj_part_t kk = j + 1;
3042  while(kk < num_cuts){
3043  // Needed when cuts shared the same position
3044  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3045  if(distance_to_cut < this->sEpsilon){
3046  my_current_part_weights[2 * kk + 1] += w;
3047  my_current_left_closest[kk] = coord;
3048  my_current_right_closest[kk] = coord;
3049  kk++;
3050  }
3051  else{
3052  //cut is far away.
3053  //just check the left closest point for the next cut.
3054  if(coord - my_current_left_closest[kk] > this->sEpsilon){
3055  my_current_left_closest[kk] = coord;
3056  }
3057  break;
3058  }
3059  }
3060 
3061 
3062  kk = j - 1;
3063  //continue checking for the cuts on the left if they share the same coordinate.
3064  while(kk >= 0){
3065  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3066  if(distance_to_cut < this->sEpsilon){
3067  my_current_part_weights[2 * kk + 1] += w;
3068  //try to write the partId as the leftmost cut.
3069  this->assigned_part_ids[i] = kk * 2 + 1;
3070  my_current_left_closest[kk] = coord;
3071  my_current_right_closest[kk] = coord;
3072  kk--;
3073  }
3074  else{
3075  //if cut is far away on the left of the point.
3076  //then just compare for right closest point.
3077  if(my_current_right_closest[kk] - coord > this->sEpsilon){
3078  my_current_right_closest[kk] = coord;
3079  }
3080  break;
3081  }
3082  }
3083 
3084  is_inserted = true;
3085  break;
3086  }
3087  else {
3088  //if point is on the left of the cut.
3089  if (distance_to_cut < 0) {
3090  bool _break = false;
3091  if(j > 0){
3092  //check the distance to the cut on the left of the current cut.
3093  //if the point is on the right of that cut, we have found the point's part.
3094  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3095  if(distance_to_next_cut > this->sEpsilon){
3096  _break = true;
3097  }
3098  }
3099  //the point is on the left of cut j; narrow the binary search
3100  //by setting the upper bound to the cut on the left.
3101  upper_cut_index = j - 1;
3102  //set the last part, and mark it as on the left of the last part.
3103  is_on_left_of_cut = true;
3104  last_compared_part = j;
3105  if(_break) break;
3106  }
3107  else {
3108  //if point is on the right of the cut.
3109  bool _break = false;
3110  if(j < num_cuts - 1){
3111  //check the distance to the cut on the right of the current cut.
3112  //if the point is on the left of that cut, we have found the point's part.
3113  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3114  if(distance_to_next_cut < minus_EPSILON){
3115  _break = true;
3116  }
3117  }
3118 
3119  //the point is on the right of cut j; narrow the binary search
3120  //by setting the lower bound to the cut on the right.
3121  lower_cut_index = j + 1;
3122  //set the last part, and mark it as on the right of the last part.
3123  is_on_right_of_cut = true;
3124  last_compared_part = j;
3125  if(_break) break;
3126  }
3127  }
3128 
3129  j = (upper_cut_index + lower_cut_index) / 2;
3130  }
3131  if(!is_inserted){
3132  if(is_on_right_of_cut){
3133 
3134  //add it to the right of the last compared part.
3135  my_current_part_weights[2 * last_compared_part + 2] += w;
3136  this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3137 
3138  //update the right closest point of last compared cut.
3139  if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3140  my_current_right_closest[last_compared_part] = coord;
3141  }
3142  //update the left closest point of the cut on the right of the last compared cut.
3143  if(last_compared_part+1 < num_cuts){
3144 
3145  if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3146  my_current_left_closest[last_compared_part + 1] = coord;
3147  }
3148  }
3149 
3150  }
3151  else if(is_on_left_of_cut){
3152 
3153  //add it to the left of the last compared part.
3154  my_current_part_weights[2 * last_compared_part] += w;
3155  this->assigned_part_ids[i] = 2 * last_compared_part;
3156 
3157 
3158  //update the left closest point of last compared cut.
3159  if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3160  my_current_left_closest[last_compared_part] = coord;
3161  }
3162 
3163  //update the right closest point of the cut on the left of the last compared cut.
3164  if(last_compared_part-1 >= 0){
3165  if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3166  my_current_right_closest[last_compared_part -1] = coord;
3167  }
3168  }
3169  }
3170  }
3171  }
3172 
3173  // prefix sum computation.
3174  //we need prefix sum for each part to determine cut positions.
3175  for (size_t i = 1; i < total_part_count; ++i){
3176  // check for cuts sharing the same position; all cuts sharing a position
3177  // have the same weight == total weight for all cuts sharing the position.
3178  // don't want to accumulate that total weight more than once.
3179  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3180  ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3181  < this->sEpsilon){
3182  //an even i corresponds to part i/2, which lies between cut i/2-1 and cut i/2.
3183  //if those two cuts share the same coordinate, the part between them is empty
3184  //and the shared on-cut weight must not be accumulated again.
3185  my_current_part_weights[i] = my_current_part_weights[i-2];
3186  continue;
3187  }
3188  //otherwise do the prefix sum.
3189  my_current_part_weights[i] += my_current_part_weights[i-1];
3190  }
3191 }
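//
// Layout sketch (editorial, not part of the original source): for num_parts = 3
// (num_cuts = 2), the total_part_count = 5 sized weight array is indexed as
//   [ part0, cut0, part1, cut1, part2 ]
// even slots accumulate weight strictly inside a part, odd slots accumulate
// weight sitting exactly on a cut; after the prefix sum, each slot holds the
// cumulative weight from the left end of the part.
//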
3192 
3193 
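3194 /* \brief Accumulates the thread-local part weights and left/right closest
3195  * points into the process-level total_part_weight_left_right_closests
3196  * array, which is subsequently reduced over all processes.
3197  */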
3201 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3202  typename mj_part_t>
3203 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3204  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3205  mj_part_t current_work_part,
3206  mj_part_t current_concurrent_num_parts){
3207 
3208 #ifdef HAVE_ZOLTAN2_OMP
3209  //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
3210  //using parallel region here reduces the performance because of the cache invalidates.
3211 #pragma omp barrier
3212 #pragma omp single
3213 #endif
3214  {
3215  size_t tlr_array_shift = 0;
3216  mj_part_t cut_shift = 0;
3217 
3218  //iterate for all concurrent parts to find the left and right closest points in the process.
3219  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3220 
3221  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3222  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3223  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3224 
3225  //iterate for cuts in a single part.
3226  for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3227  mj_part_t next = tlr_array_shift + ii;
3228  mj_part_t cut_index = cut_shift + ii;
3229  if(this->is_cut_line_determined[cut_index]) continue;
3230  mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3231  right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3232 
3233  //find the closest points from left and right for the cut in the process.
3234  for (int j = 1; j < this->num_threads; ++j){
3235  if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3236  right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3237  }
3238  if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3239  left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3240  }
3241  }
3242  //store the left and right closest points.
3243  this->total_part_weight_left_right_closests[num_total_part_in_part +
3244  next] = left_closest_in_process;
3245  this->total_part_weight_left_right_closests[num_total_part_in_part +
3246  num_cuts_in_part + next] = right_closest_in_process;
3247  }
3248  //set the shift position in the arrays
3249  tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3250  cut_shift += num_cuts_in_part;
3251  }
3252 
3253  tlr_array_shift = 0;
3254  cut_shift = 0;
3255  size_t total_part_array_shift = 0;
3256 
3257  //iterate for all concurrent parts to find the total weight in the process.
3258  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3259 
3260  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3261  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3262  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3263 
3264  for(size_t j = 0; j < num_total_part_in_part; ++j){
3265 
3266  mj_part_t cut_ind = j / 2 + cut_shift;
3267 
3268  //need to check j != num_total_part_in_part - 1,
3269  //which is the same as j/2 != num_cuts_in_part.
3270  //we cannot check it using cut_ind, because of the concurrent part concatenation.
3271  if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
3272  double pwj = 0;
3273  for (int k = 0; k < this->num_threads; ++k){
3274  pwj += this->thread_part_weights[k][total_part_array_shift + j];
3275  }
3276  //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
3277  this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3278  }
3279  cut_shift += num_cuts_in_part;
3280  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3281  total_part_array_shift += num_total_part_in_part;
3282  }
3283  }
3284  //the other threads need to wait here,
3285  //but we don't need a pragma omp barrier,
3286  //as omp single already has an implicit barrier.
3287 }
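//
// Layout sketch (editorial, not part of the original source): for one
// concurrent part with P parts and P-1 cuts, its section of the concatenated
// array is
//   [ 2P-1 part/cut weights | P-1 left closest | P-1 right closest ]
// and sections for successive concurrent parts are appended back to back.
//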
3288 
3289 
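3290 /* \brief Calculates the new position of a cut line by interpolating between
3291  * its lower and upper bounds according to the expected cumulative weight.
3292  * \param cut_upper_bound upper bound coordinate of the cut.
3293  * \param cut_lower_bound lower bound coordinate of the cut.
3294  * \param cut_upper_weight cumulative weight at the upper bound.
3295  * \param cut_lower_weight cumulative weight at the lower bound.
3296  * \param expected_weight the target cumulative weight for the cut.
3297  * \param new_cut_position output, the new coordinate of the cut.
3298  */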
3299 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3300  typename mj_part_t>
3301 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3302  mj_scalar_t cut_upper_bound,
3303  mj_scalar_t cut_lower_bound,
3304  mj_scalar_t cut_upper_weight,
3305  mj_scalar_t cut_lower_weight,
3306  mj_scalar_t expected_weight,
3307  mj_scalar_t &new_cut_position){
3308 
3309  if(ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3310  new_cut_position = cut_upper_bound; return; //or the lower bound; they are equal within tolerance.
3311  }
3312 
3313 
3314  if(ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3315  new_cut_position = cut_lower_bound; return; //avoid dividing by the near-zero weight range below.
3316  }
3317 
3318  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3319  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3320  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3321 
3322  mj_scalar_t required_shift = (my_weight_diff / weight_range);
3323  int scale_constant = 20;
3324  int shiftint= int (required_shift * scale_constant);
3325  if (shiftint == 0) shiftint = 1;
3326  required_shift = mj_scalar_t (shiftint) / scale_constant;
3327  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3328 }
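//
// Worked example (editorial sketch, not part of the original source): with
// cut_lower_bound = 0, cut_upper_bound = 10, cut_lower_weight = 100,
// cut_upper_weight = 200 and expected_weight = 133:
//   required_shift   = 33 / 100 = 0.33
//   shiftint         = int(0.33 * 20) = 6, so required_shift becomes 6/20 = 0.3
//   new_cut_position = 10 * 0.3 + 0 = 3.0
// Quantizing the shift to the 1/20 grid (with a minimum of 1/20) keeps each
// proposed cut strictly inside the current bounds, so the search interval
// keeps shrinking even for skewed weight distributions.
//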
3329 
3330 
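3331 /* \brief Creates the new part assignments after the cuts are determined:
3332  * each coordinate in [coordinate_begin, coordinate_end) is assigned to its
3333  * final part, points sitting exactly on a cut line are distributed to the
3334  * neighboring parts according to the weight each thread may still put on
3335  * the left of that cut, and out_part_xadj receives the end index of each
3336  * new part.
3337  */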
3341 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3342  typename mj_part_t>
3343 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3344  mj_part_t num_parts,
3345  mj_scalar_t *mj_current_dim_coords,
3346  mj_scalar_t *current_concurrent_cut_coordinate,
3347  mj_lno_t coordinate_begin,
3348  mj_lno_t coordinate_end,
3349  mj_scalar_t *used_local_cut_line_weight_to_left,
3350  double **used_thread_part_weight_work,
3351  mj_lno_t *out_part_xadj){
3352 
3353  mj_part_t num_cuts = num_parts - 1;
3354 
3355 #ifdef HAVE_ZOLTAN2_OMP
3356 #pragma omp parallel
3357 #endif
3358  {
3359  int me = 0;
3360 #ifdef HAVE_ZOLTAN2_OMP
3361  me = omp_get_thread_num();
3362 #endif
3363 
3364  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3365  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3366 
3367  //now if the rectilinear partitioning is allowed we decide how
3368  //much weight each thread should put to left and right.
3369  if (this->distribute_points_on_cut_lines){
3370  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3371  // this loop assumes the static scheduling used in the mj_1D_part calculation.
3372 #ifdef HAVE_ZOLTAN2_OMP
3373 #pragma omp for
3374 #endif
3375  for (mj_part_t i = 0; i < num_cuts; ++i){
3376  //the weight to be put on the left of the cut.
3377  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3378  for(int ii = 0; ii < this->num_threads; ++ii){
3379  if(left_weight > this->sEpsilon){
3380  //the weight of thread ii on cut.
3381  mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3382  if(thread_ii_weight_on_cut < left_weight){
3383  //if the left weight is bigger than the thread's weight on the cut.
3384  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3385  }
3386  else {
3387  //if thread's weight is bigger than space, then put only a portion.
3388  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3389  }
3390  left_weight -= thread_ii_weight_on_cut;
3391  }
3392  else {
3393  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3394  }
3395  }
3396  }
3397 
3398  if(num_cuts > 0){
3399  //this is a special case. If cutlines share the same coordinate, their weights are equal.
3400  //we need to adjust the ratio for that.
3401  for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3402  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3403  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3404  }
3405  my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
3406  / mj_scalar_t(SIGNIFICANCE_MUL);
3407  }
3408  }
3409  }
3410 
3411  for(mj_part_t ii = 0; ii < num_parts; ++ii){
3412  thread_num_points_in_parts[ii] = 0;
3413  }
3414 
3415 
3416 #ifdef HAVE_ZOLTAN2_OMP
3417  //don't change the static scheduling; the same static partitioning is relied upon later as well.
3418 #pragma omp for
3419 #endif
3420  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3421 
3422  mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3423  mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3424  mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3425  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3426  if(coordinate_assigned_place % 2 == 1){
3427  //if it is on the cut.
3428  if(this->distribute_points_on_cut_lines
3429  && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3430  //if the rectilinear partitioning is allowed,
3431  //and the thread still has space to put on the left of the cut,
3432  //then the thread puts the coordinate to the left.
3433  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3434  //if putting the coordinate to the left overshot the available space,
3435  //and the next cut is on the same coordinate,
3436  //then we need to adjust how much weight the next cut puts to its left as well,
3437  //in order to take care of the imbalance.
3438  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3439  && coordinate_assigned_part < num_cuts - 1
3440  && ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3441  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3442  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3443  }
3444  ++thread_num_points_in_parts[coordinate_assigned_part];
3445  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3446  }
3447  else{
3448  //if there is no more space on the left, put the coordinate to the right of the cut.
3449  ++coordinate_assigned_part;
3450  //this while loop is necessary when a line is partitioned into more than 2 parts.
3451  while(this->distribute_points_on_cut_lines &&
3452  coordinate_assigned_part < num_cuts){
3453  //traverse all the cut lines sharing the same coordinate.
3454  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3455  current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3456  < this->sEpsilon){
3457  //if this cut still has enough space on its left, put the coordinate there.
3458  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3459  this->sEpsilon &&
3460  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3461  ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3462  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3463  //Again, if we put too much on the left of the cut,
3464  //update how much the next cut sharing the same coordinate will put to its left.
3465  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3466  coordinate_assigned_part < num_cuts - 1 &&
3467  ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3468  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3469  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3470  }
3471  break;
3472  }
3473  }
3474  else {
3475  break;
3476  }
3477  ++coordinate_assigned_part;
3478  }
3479  ++thread_num_points_in_parts[coordinate_assigned_part];
3480  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3481  }
3482  }
3483  else {
3484  //if it is already assigned to a part, then just put it to the corresponding part.
3485  ++thread_num_points_in_parts[coordinate_assigned_part];
3486  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3487  }
3488  }
3489 
3490 
3491 
3492  //now we calculate where each thread will write in the new_coordinate_permutations array.
3493  //first we find out_part_xadj, by marking the begin and end points of each part found.
3494  //the loop below finds the number of points in each part, and writes it to out_part_xadj.
3495 #ifdef HAVE_ZOLTAN2_OMP
3496 #pragma omp for
3497 #endif
3498  for(mj_part_t j = 0; j < num_parts; ++j){
3499  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3500  for (int i = 0; i < this->num_threads; ++i){
3501  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3502  //prefix sum to thread point counts, so that each will have private space to write.
3503  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3504  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3505 
3506  }
3507  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3508  }
3509 
3510  //now we need a prefix sum over out_part_xadj, so that it points to the begin and end of each part.
3511 #ifdef HAVE_ZOLTAN2_OMP
3512 #pragma omp single
3513 #endif
3514  {
3515  //perform prefix sum for num_points in parts.
3516  for(mj_part_t j = 1; j < num_parts; ++j){
3517  out_part_xadj[j] += out_part_xadj[j - 1];
3518  }
3519  }
3520 
3521  //shift each thread's point counts to obtain the
3522  //beginning index of each thread's private write space.
3523  for(mj_part_t j = 1; j < num_parts; ++j){
3524  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3525  }
3526 
3527 
3528  //now each thread reads its coordinates and writes each coordinate's index to the permutation array,
3529  //at the position computed from the coordinate's part.
3530 #ifdef HAVE_ZOLTAN2_OMP
3531 #pragma omp for
3532 #endif
3533  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3534  mj_lno_t i = this->coordinate_permutations[ii];
3535  mj_part_t p = this->assigned_part_ids[i];
3536  this->new_coordinate_permutations[coordinate_begin +
3537  thread_num_points_in_parts[p]++] = i;
3538  }
3539  }
3540 }
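// Editor's note: a serial sketch of the two-level prefix sum performed above, which
// turns per-thread, per-part point counts into private write offsets. Plain int
// types and the names below are illustrative assumptions, not Zoltan2 API.
inline void example_thread_write_offsets(int num_threads, int num_parts,
                                         std::vector<std::vector<int> > &counts, // in: counts[t][p]; out: offset of thread t within part p
                                         std::vector<int> &part_xadj){           // out: end index of each part
  part_xadj.assign(num_parts, 0);
  for (int p = 0; p < num_parts; ++p){
    int sum = 0;
    for (int t = 0; t < num_threads; ++t){
      int c = counts[t][p];
      counts[t][p] = sum; // thread t starts writing part p at this offset within the part
      sum += c;
    }
    part_xadj[p] = sum;   // total number of points in part p
  }
  //prefix sum over parts, so part_xadj[p] becomes the end of part p;
  //thread t's absolute offset for part p is then part_xadj[p - 1] + counts[t][p].
  for (int p = 1; p < num_parts; ++p) part_xadj[p] += part_xadj[p - 1];
}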
3541 
3542 
3543 
3572 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3573  typename mj_part_t>
3574 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3575  const size_t &num_total_part,
3576  const mj_part_t &num_cuts,
3577  const mj_scalar_t &max_coordinate,
3578  const mj_scalar_t &min_coordinate,
3579  const mj_scalar_t &global_total_weight,
3580  const mj_scalar_t &used_imbalance_tolerance,
3581  mj_scalar_t * current_global_part_weights,
3582  const mj_scalar_t * current_local_part_weights,
3583  const mj_scalar_t *current_part_target_weights,
3584  bool *current_cut_line_determined,
3585  mj_scalar_t *current_cut_coordinates,
3586  mj_scalar_t *current_cut_upper_bounds,
3587  mj_scalar_t *current_cut_lower_bounds,
3588  mj_scalar_t *current_global_left_closest_points,
3589  mj_scalar_t *current_global_right_closest_points,
3590  mj_scalar_t * current_cut_lower_bound_weights,
3591  mj_scalar_t * current_cut_upper_weights,
3592  mj_scalar_t *new_current_cut_coordinates,
3593  mj_scalar_t *current_part_cut_line_weight_to_put_left,
3594  mj_part_t *rectilinear_cut_count,
3595  mj_part_t &my_num_incomplete_cut){
3596 
3597  //seen weight in the part
3598  mj_scalar_t seen_weight_in_part = 0;
3599  //expected weight for part.
3600  mj_scalar_t expected_weight_in_part = 0;
3601  //imbalance for the left and right side of the cut.
3602  mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3603 
3604 
3605 #ifdef HAVE_ZOLTAN2_OMP
3606 #pragma omp for
3607 #endif
3608  for (mj_part_t i = 0; i < num_cuts; i++){
3609  //if the left and right closest points are not set yet,
3610  //set them to the cut itself.
3611  if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3612  current_global_left_closest_points[i] = current_cut_coordinates[i];
3613  if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3614  current_global_right_closest_points[i] = current_cut_coordinates[i];
3615 
3616  }
3617 #ifdef HAVE_ZOLTAN2_OMP
3618 #pragma omp for
3619 #endif
3620  for (mj_part_t i = 0; i < num_cuts; i++){
3621 
3622  if(this->distribute_points_on_cut_lines){
3623  //init the weight on the cut.
3624  this->global_rectilinear_cut_weight[i] = 0;
3625  this->process_rectilinear_cut_weight[i] = 0;
3626  }
3627  //if the cut was already determined in previous iterations,
3628  //then just write the coordinate to the new array and proceed.
3629  if(current_cut_line_determined[i]) {
3630  new_current_cut_coordinates[i] = current_cut_coordinates[i];
3631  continue;
3632  }
3633 
3634  //current weight of the part at the left of the cut line.
3635  seen_weight_in_part = current_global_part_weights[i * 2];
3636 
3643  //expected weight for the part to the left of this cut.
3644  expected_weight_in_part = current_part_target_weights[i];
3645  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
3646  imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3647  //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
3648  imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3649 
3650  bool is_left_imbalance_valid = ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3651  bool is_right_imbalance_valid = ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3652 
3653  //if the cut line reaches the desired imbalance on both sides, it is determined.
3654  if(is_left_imbalance_valid && is_right_imbalance_valid){
3655  current_cut_line_determined[i] = true;
3656 #ifdef HAVE_ZOLTAN2_OMP
3657 #pragma omp atomic
3658 #endif
3659  my_num_incomplete_cut -= 1;
3660  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3661  continue;
3662  }
3663  else if(imbalance_on_left < 0){
3664  //if left imbalance < 0 then we need to move the cut to right.
3665 
3666  if(this->distribute_points_on_cut_lines){
3667  //if it is okay to distribute the coordinates
3668  //on the cut line to both left and right,
3669  //then check if we can reach the target weight by including all the
3670  //coordinates on the cut in the part.
3671  if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3672  //if so, we are done.
3673  current_cut_line_determined[i] = true;
3674 #ifdef HAVE_ZOLTAN2_OMP
3675 #pragma omp atomic
3676 #endif
3677  my_num_incomplete_cut -= 1;
3678 
3679  //then assign everything on the cut to the left of the cut.
3680  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3681 
3682  //for this cut, all the weight on the cut will be put to the left.
3683 
3684  current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3685  continue;
3686  }
3687  else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3688 
3689  //if the weight is larger than the expected weight,
3690  //then we need to distribute some points to the left, some to the right.
3691  current_cut_line_determined[i] = true;
3692 #ifdef HAVE_ZOLTAN2_OMP
3693 #pragma omp atomic
3694 #endif
3695  *rectilinear_cut_count += 1;
3696  //increase the number of cuts to be determined by rectilinear partitioning.
3697 
3698 #ifdef HAVE_ZOLTAN2_OMP
3699 #pragma omp atomic
3700 #endif
3701  my_num_incomplete_cut -= 1;
3702  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3703  this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3704  current_local_part_weights[i * 2];
3705  continue;
3706  }
3707  }
3708  //we need to move further right, so set the lower bound to the current line, and shift it to the closest point from the right.
3709  current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3710  //set the lower bound weight to the weight we have seen.
3711  current_cut_lower_bound_weights[i] = seen_weight_in_part;
3712 
3713  //compare the upper bound with what has been found in the last iteration.
3714  //we try to make stricter bounds for the cut here.
3715  for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3716  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3717  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3718 
3719  if(p_weight >= expected_weight_in_part){
3720  //if a cut on the right has exactly the expected weight, then we found
3721  //our cut position. Set the upper and lower bounds to this new cut coordinate,
3722  //but run one more iteration to finalize the cut position,
3723  //as we still need to update the part ids.
3724  if(p_weight == expected_weight_in_part){
3725  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3726  current_cut_upper_weights[i] = p_weight;
3727  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3728  current_cut_lower_bound_weights[i] = p_weight;
3729  } else if (p_weight < current_cut_upper_weights[i]){
3730  //if a part weight is larger than my expected weight,
3731  //but lower than my upper bound weight, update upper bound.
3732  current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3733  current_cut_upper_weights[i] = p_weight;
3734  }
3735  break;
3736  }
3737  //if we come here, then p_weight < expected_weight_in_part;
3738  //compare against the weight including the cut line.
3739  if(line_weight >= expected_weight_in_part){
3740  //if the weight including the line is larger than the expected weight,
3741  //then we need to reach the balance by distributing coordinates on this line.
3742  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3743  current_cut_upper_weights[i] = line_weight;
3744  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3745  current_cut_lower_bound_weights[i] = p_weight;
3746  break;
3747  }
3748  //if a stricter lower bound is found,
3749  //update the lower bound.
3750  if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3751  current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3752  current_cut_lower_bound_weights[i] = p_weight;
3753  }
3754  }
3755 
3756 
3757  mj_scalar_t new_cut_position = 0;
3758  this->mj_calculate_new_cut_position(
3759  current_cut_upper_bounds[i],
3760  current_cut_lower_bounds[i],
3761  current_cut_upper_weights[i],
3762  current_cut_lower_bound_weights[i],
3763  expected_weight_in_part, new_cut_position);
3764 
3765  //if the cut line does not move significantly,
3766  //then finalize the search for this cut.
3767  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3768  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
3769  ){
3770  current_cut_line_determined[i] = true;
3771 #ifdef HAVE_ZOLTAN2_OMP
3772 #pragma omp atomic
3773 #endif
3774  my_num_incomplete_cut -= 1;
3775 
3776  //set the cut coordinate and proceed.
3777  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3778  } else {
3779  new_current_cut_coordinates [i] = new_cut_position;
3780  }
3781  } else {
3782 
3783  //need to move the cut line to the left.
3784  //set the upper bound to the current line.
3785  current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3786  current_cut_upper_weights[i] = seen_weight_in_part;
3787 
3788  // compare the current cut line weights with previous upper and lower bounds.
3789  for (int ii = i - 1; ii >= 0; --ii){
3790  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3791  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3792  if(p_weight <= expected_weight_in_part){
3793  if(p_weight == expected_weight_in_part){
3794  //if the weight of the part is my expected weight
3795  //then we have found the solution.
3796  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3797  current_cut_upper_weights[i] = p_weight;
3798  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3799  current_cut_lower_bound_weights[i] = p_weight;
3800  }
3801  else if (p_weight > current_cut_lower_bound_weights[i]){
3802  //if the found weight is bigger than the lower bound,
3803  //then update the lower bound.
3804  current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3805  current_cut_lower_bound_weights[i] = p_weight;
3806 
3807  //at the same time, if the weight of the line is bigger than the
3808  //expected weight, then update the upper bound as well.
3809  //in this case the balance will be obtained by distributing weights
3810  //on this cut position.
3811  if(line_weight > expected_weight_in_part){
3812  current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3813  current_cut_upper_weights[i] = line_weight;
3814  }
3815  }
3816  break;
3817  }
3818  //if the weight of the cut on the left is still bigger than my weight,
3819  //and also if the weight is smaller than the current upper weight,
3820  //or if the weight is equal to the current upper weight but on the left of
3821  //the upper bound, then update the upper bound.
3822  if (p_weight >= expected_weight_in_part &&
3823  (p_weight < current_cut_upper_weights[i] ||
3824  (p_weight == current_cut_upper_weights[i] &&
3825  current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3826  )
3827  )
3828  ){
3829  current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3830  current_cut_upper_weights[i] = p_weight;
3831  }
3832  }
3833  mj_scalar_t new_cut_position = 0;
3834  this->mj_calculate_new_cut_position(
3835  current_cut_upper_bounds[i],
3836  current_cut_lower_bounds[i],
3837  current_cut_upper_weights[i],
3838  current_cut_lower_bound_weights[i],
3839  expected_weight_in_part,
3840  new_cut_position);
3841 
3842  //if the cut line does not move significantly.
3843  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3844  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
3845  current_cut_line_determined[i] = true;
3846 #ifdef HAVE_ZOLTAN2_OMP
3847 #pragma omp atomic
3848 #endif
3849  my_num_incomplete_cut -= 1;
3850  //set the cut coordinate and proceed.
3851  new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3852  } else {
3853  new_current_cut_coordinates [ i] = new_cut_position;
3854  }
3855  }
3856  }
3857 
3858  { // This unnecessary bracket works around a compiler bug in NVCC when enabling OpenMP as well
3859 
3860  //communication to determine the ratios of processors for the distribution
3861  //of coordinates on the cut lines.
3862 #ifdef HAVE_ZOLTAN2_OMP
3863  //no barrier is needed here as it is implicit.
3864 #pragma omp single
3865 #endif
3866  {
3867  if(*rectilinear_cut_count > 0){
3868 
3869  try{
3870  Teuchos::scan<int,mj_scalar_t>(
3871  *comm, Teuchos::REDUCE_SUM,
3872  num_cuts,
3873  this->process_rectilinear_cut_weight,
3874  this->global_rectilinear_cut_weight
3875  );
3876  }
3877  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
3878 
3879  for (mj_part_t i = 0; i < num_cuts; ++i){
3880  //if this cut line has weight to be distributed.
3881  if(this->global_rectilinear_cut_weight[i] > 0) {
3882  //expected weight to go to left of the cut.
3883  mj_scalar_t expected_part_weight = current_part_target_weights[i];
3884  //the weight that should be put to left of the cut.
3885  mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
3886  //the weight of the cut in the process
3887  mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
3888  //the sum of the cut weights up to this process, including the weight of this process.
3889  mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
3890  //the space left on the left side of the cut after all processes up to and including this one
3891  //put their weights on the cut to the left.
3892  mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
3893  //add my weight to this space to find out how much space is left to me.
3894  mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
3895 
3904  if(space_left_to_me < 0){
3905  //space_left_to_me is negative; I don't need to put anything to the left.
3906  current_part_cut_line_weight_to_put_left[i] = 0;
3907  }
3908  else if(space_left_to_me >= my_weight_on_line){
3909  //the space left to me is bigger than this process's weight on the cut,
3910  //so put everything to the left.
3911  current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
3913  }
3914  else {
3915  //put only as much weight as the space allows.
3916  current_part_cut_line_weight_to_put_left[i] = space_left_to_me;
3917 
3919  }
3920 
3921  }
3922  }
3923  *rectilinear_cut_count = 0;
3924  }
3925  }
3926  }
3927 }
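// Editor's note: a minimal sketch of the scan-based distribution above. After an
// inclusive prefix sum (Teuchos::scan with REDUCE_SUM) of the per-rank weights
// sitting exactly on a cut, each rank can decide locally how much of its own weight
// still fits to the left. Plain doubles and the names below are illustrative.
inline double example_weight_to_put_left(double needed_on_left,    // target weight minus weight strictly left of the cut
                                         double my_weight_on_cut,  // this rank's weight exactly on the cut
                                         double scan_inclusive){   // inclusive prefix sum of on-cut weights up to this rank
  //space remaining once every rank up to and including this one claims its share.
  double space_left_to_me = needed_on_left - scan_inclusive + my_weight_on_cut;
  if (space_left_to_me < 0) return 0;                                // earlier ranks already filled the left side
  if (space_left_to_me >= my_weight_on_cut) return my_weight_on_cut; // everything fits to the left
  return space_left_to_me;                                           // only part of my weight fits
}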
3928 
3938 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3939  typename mj_part_t>
3940 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
3941  mj_part_t num_procs,
3942  mj_part_t num_parts,
3943  mj_gno_t *&num_points_in_all_processor_parts){
3944 
3945  //the allocation size is num_parts entries per processor, plus one extra block of num_parts for the global sums.
3946  size_t allocation_size = num_parts * (num_procs + 1);
3947 
3948  //this will be the output.
3949  //it holds how many points each processor has in each part;
3950  //the last portion is the sum over all processors of the points in each part.
3951 
3952  //allocate memory for the local num coordinates in each part.
3953  mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
3954 
3955 
3956  //this is the portion of the memory which will be used
3957  //in the summation to obtain the total number of points in each part.
3958  mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
3959  //this is the portion of the memory where each process stores its local counts;
3960  //this information is needed by the other processors.
3961  mj_gno_t *my_local_point_counts_in_each_part = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
3962 
3963  //initialize the array with 0's.
3964  memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size);
3965 
3966  //write the number of coordinates in each part.
3967  for (mj_part_t i = 0; i < num_parts; ++i){
3968  mj_lno_t part_begin_index = 0;
3969  if (i > 0){
3970  part_begin_index = this->new_part_xadj[i - 1];
3971  }
3972  mj_lno_t part_end_index = this->new_part_xadj[i];
3973  my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
3974  }
3975 
3976  //copy the local counts into this rank's own portion of the array,
3977  //so that after the reduction each rank's portion holds its per-part counts.
3978  memcpy (my_local_point_counts_in_each_part,
3979  my_local_points_to_reduce_sum,
3980  sizeof(mj_gno_t) * (num_parts) );
3981 
3982 
3983  //reduceAll operation.
3984  //the portion that belongs to a processor with index p
3985  //starts at p * num_parts.
3986  //the global number of points in each part is held in the last num_parts entries.
3987  try{
3988  reduceAll<int, mj_gno_t>(
3989  *(this->comm),
3990  Teuchos::REDUCE_SUM,
3991  allocation_size,
3992  num_local_points_in_each_part_to_reduce_sum,
3993  num_points_in_all_processor_parts);
3994  }
3995  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
3996  freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
3997 }
3998 
3999 
4000 
4013 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4014  typename mj_part_t>
4015 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4016  size_t migration_reduce_all_population,
4017  mj_lno_t num_coords_for_last_dim_part,
4018  mj_part_t num_procs,
4019  mj_part_t num_parts,
4020  mj_gno_t *num_points_in_all_processor_parts){
4021 
4022  //if the estimated reduce-all population in the last dimension is too high, force migration.
4023  if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
4024  //if the work per processor in a part of the last dimension is too low, force migration.
4025  if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
4026 
4027  //if migration is to be checked and the imbalance is too high
4028  if (this->check_migrate_avoid_migration_option == 0){
4029  double global_imbalance = 0;
4030  //global shift to reach the sums of the coordinate counts in each part.
4031  size_t global_shift = num_procs * num_parts;
4032 
4033  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4034  for (mj_part_t i = 0; i < num_parts; ++i){
4035  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4036  / double(num_procs);
4037 
4038  global_imbalance += ZOLTAN2_ABS(ideal_num -
4039  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4040  }
4041  }
4042  global_imbalance /= num_parts;
4043  global_imbalance /= num_procs;
4044 
4050 
4051  if(global_imbalance <= this->minimum_migration_imbalance){
4052  return false;
4053  }
4054  else {
4055  return true;
4056  }
4057  }
4058  else {
4059  //if migration is not being checked, it is always performed.
4060  return true;
4061  }
4062 }
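// Editor's note: the imbalance metric above, isolated: the average over all
// (processor, part) cells of |actual - ideal| / ideal, where ideal is the global
// part size divided evenly among processors. Plain types are assumed, and the
// file's own ZOLTAN2_ABS macro is reused.
inline double example_migration_imbalance(int num_procs, int num_parts,
                                          const long long *counts){ // layout as in the reduce buffer above
  double total = 0;
  for (int p = 0; p < num_procs; ++p){
    for (int i = 0; i < num_parts; ++i){
      //ideal share of part i for one processor.
      double ideal = counts[(size_t)num_procs * num_parts + i] / double(num_procs);
      total += ZOLTAN2_ABS(ideal - counts[(size_t)p * num_parts + i]) / ideal;
    }
  }
  return total / num_parts / num_procs; // migration is triggered when this exceeds the tolerance
}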
4063 
4064 
4074 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4075  typename mj_part_t>
4076 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4077  mj_part_t num_parts,
4078  mj_part_t *part_assignment_proc_begin_indices,
4079  mj_part_t *processor_chains_in_parts,
4080  mj_lno_t *send_count_to_each_proc,
4081  int *coordinate_destinations){
4082 
4083  for (mj_part_t p = 0; p < num_parts; ++p){
4084  mj_lno_t part_begin = 0;
4085  if (p > 0) part_begin = this->new_part_xadj[p - 1];
4086  mj_lno_t part_end = this->new_part_xadj[p];
4087 
4088  //get the first processor to which the current processor sends its points in part p.
4089  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4090  //initialize how many points I have sent to this processor.
4091  mj_lno_t num_total_send = 0;
4092  for (mj_lno_t j=part_begin; j < part_end; j++){
4093  mj_lno_t local_ind = this->new_coordinate_permutations[j];
4094  while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4095  //then get the next processor to send the points in part p.
4096  num_total_send = 0;
4097  //advance part_assignment_proc_begin_indices[p] to the next processor in the chain.
4098  part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4099  //remove the previous processor from the chain.
4100  processor_chains_in_parts[proc_to_sent] = -1;
4101  //choose the next processor as the one to send to.
4102  proc_to_sent = part_assignment_proc_begin_indices[p];
4103  }
4104  //record the destination processor for this coordinate.
4105  coordinate_destinations[local_ind] = proc_to_sent;
4106  ++num_total_send;
4107  }
4108  }
4109 }
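// Editor's note: a small sketch of the array-based linked list traversed above.
// chain_begin[p] holds the current destination processor for part p, and
// next_in_chain[proc] points to the following processor, with -1 terminating the
// chain. The names here are illustrative, not the member names used in the class.
inline int example_advance_chain(int p, int *chain_begin, int *next_in_chain){
  int head = chain_begin[p];            // processor whose send quota was just exhausted
  chain_begin[p] = next_in_chain[head]; // the next processor in the chain becomes current
  next_in_chain[head] = -1;             // detach the exhausted processor
  return chain_begin[p];                // new destination, or -1 if the chain is exhausted
}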
4110 
4125 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4126  typename mj_part_t>
4127 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4128  mj_gno_t * num_points_in_all_processor_parts,
4129  mj_part_t num_parts,
4130  mj_part_t num_procs,
4131  mj_lno_t *send_count_to_each_proc,
4132  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4133  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4134  mj_part_t &out_part_index,
4135  mj_part_t &output_part_numbering_begin_index,
4136  int *coordinate_destinations){
4137 
4138 
4139  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4140  mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4141 
4142  //boolean variable set when this process finds the part it is assigned to.
4143  bool did_i_find_my_group = false;
4144 
4145  mj_part_t num_free_procs = num_procs;
4146  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4147 
4148  double max_imbalance_difference = 0;
4149  mj_part_t max_differing_part = 0;
4150 
4151  //find how many processor each part requires.
4152  for (mj_part_t i=0; i < num_parts; i++){
4153 
4154  //fractional number of processors required for this part.
4155  double scalar_required_proc = num_procs *
4156  (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4157 
4158  //round it to closest integer.
4159  mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
4160 
4161  //if assigning the required number of procs would leave too few procs for the rest of the parts,
4162  //then only assign (num_free_procs - minimum_num_procs_required_for_rest_of_parts) procs to this part.
4163  if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4164  required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4165  }
4166 
4167  //reduce the free processor count
4168  num_free_procs -= required_proc;
4169  //reduce the minimum processor count required for the rest of the parts by 1.
4170  --minimum_num_procs_required_for_rest_of_parts;
4171 
4172  //part (i) is assigned to (required_proc) processors.
4173  num_procs_assigned_to_each_part[i] = required_proc;
4174 
4175  //because of the rounding, some processors might be left unassigned.
4176  //we want to assign those processors to the part with the most imbalance.
4177  //find the part with the maximum imbalance here.
4178  double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4179  if (imbalance_wrt_ideal > max_imbalance_difference){
4180  max_imbalance_difference = imbalance_wrt_ideal;
4181  max_differing_part = i;
4182  }
4183  }
4184 
4185  //assign the extra processors to the part with the maximum imbalance relative to the ideal.
4186  if (num_free_procs > 0){
4187  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4188  }
4189 
4190  //now find what are the best processors with least migration for each part.
4191 
4192  //part_assignment_proc_begin_indices[i] holds the first
4193  //processor to which data for part i is sent.
4194  mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4195  //the next processor to send to is found in processor_chains_in_parts, in a linked-list manner.
4196  mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4197  mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4198 
4199  //initialize the assignment of each processor.
4200  //this has a linked-list implementation.
4201  //the first of the processors assigned
4202  //to each part is held at part_assignment_proc_begin_indices[part].
4203  //the next processor assigned to that part is located at
4204  //processor_chains_in_parts[part_assignment_proc_begin_indices[part]]; the chain
4205  //continues until the value of -1 is reached.
4206  for (int i = 0; i < num_procs; ++i ){
4207  processor_part_assignments[i] = -1;
4208  processor_chains_in_parts[i] = -1;
4209  }
4210  for (int i = 0; i < num_parts; ++i ){
4211  part_assignment_proc_begin_indices[i] = -1;
4212  }
4213 
4214 
4215  //Allocate memory for sorting data structure.
4216  uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4217  for(mj_part_t i = 0; i < num_parts; ++i){
4218  //the algorithm tries to minimize the cost of migration,
4219  //by assigning the processors with highest number of coordinates on that part.
4220  //here we might want to implement a maximum weighted bipartite matching algorithm.
4221  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4222  sort_item_num_part_points_in_procs[ii].id = ii;
4223  //if the processor is not assigned yet,
4224  //add its number of points to the sort data structure.
4225  if (processor_part_assignments[ii] == -1){
4226  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4227  sort_item_num_part_points_in_procs[ii].signbit = 1; //indicate that the processor has positive weight.
4228  }
4229  else {
4230  //if the processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
4231  //it would be the same if we simply set it to -1,
4232  //but this provides more information (used later) at no extra cost.
4233  //sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
4234 
4235  //UPDATE: since the negation above triggers a warning when gno_t is unsigned, we added an extra sign bit to the sort item.
4236  //It is 1 for positives, 0 for negatives.
4237  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4238  sort_item_num_part_points_in_procs[ii].signbit = 0;
4239  }
4240  }
4241  //sort the processors in the part.
4242  uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4243 
4251 
4252  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4253  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4254  mj_gno_t ideal_num_points_in_a_proc =
4255  Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count)));
4256 
4257  //start sending to the least loaded of the assigned processors.
4258  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4259  mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4260  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4261 
4262  //find the processors that will be assigned to this part, which are the heaviest
4263  //non-assigned processors.
4264  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4265  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4266  //assign processor to part - i.
4267  processor_part_assignments[proc_id] = i;
4268  }
4269 
4270  bool did_change_sign = false;
4271  //if a processor has a negative count (signbit == 0), flip it back to positive.
4272  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4273  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
4274  // TODO: SEE BUG 6194
4275  if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4276  did_change_sign = true;
4277  sort_item_num_part_points_in_procs[ii].signbit = 1;
4278  }
4279  else {
4280  break;
4281  }
4282  }
4283  if(did_change_sign){
4284  //re-sort the remaining processors that are not yet assigned.
4285  uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4286  }
4294 
4295  //check if this processor is one of the procs assigned to this part.
4296  //if it is, then get the group.
4297  if (!did_i_find_my_group){
4298  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4299 
4300  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4301  //add the proc to the group.
4302  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4303 
4304  if(proc_id_to_assign == this->myRank){
4305  //if the assigned process is me, then I have found my group.
4306  did_i_find_my_group = true;
4307  //set the beginning of part i to my rank.
4308  part_assignment_proc_begin_indices[i] = this->myRank;
4309  processor_chains_in_parts[this->myRank] = -1;
4310 
4311  //set send count to myself to the number of points that I have in part i.
4312  send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4313 
4314  //calculate the shift required for the output_part_numbering_begin_index
4315  for (mj_part_t in = 0; in < i; ++in){
4316  output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4317  }
4318  out_part_index = i;
4319  }
4320  }
4321  //if this was not my group,
4322  //clear the subcommunicator processor array.
4323  if (!did_i_find_my_group){
4324  processor_ranks_for_subcomm.clear();
4325  }
4326  }
4327 
4328  //send the points of the non-assigned processors to the assigned processors,
4329  //starting from the heaviest non-assigned processor.
4330  //TODO we might want to play with this part: it allows more computational imbalance
4331  //but gives better communication balance.
4332  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4333  mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4334  mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4335 
4336  //the point counts of the assigned processors were negated earlier and
4337  //have already been reversed above, so a negative count here should not happen.
4338 #ifdef MJ_DEBUG
4339  if (num_points_to_sent < 0) {
4340  std::cout << "Migration - processor assignments - for part:" << i << " from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl;
4341  exit(1);
4342  }
4343 #endif
4344 
4345  //now send the points to the assigned processors.
4346  while (num_points_to_sent > 0){
4347  //if the processor has enough space.
4348  if (num_points_to_sent <= space_left_in_sent_proc){
4349  //reduce the space left in the processor.
4350  space_left_in_sent_proc -= num_points_to_sent;
4351  //if my rank is the one that is sending the coordinates.
4352  if (this->myRank == nonassigned_proc_id){
4353  //set my send count to the receiving processor.
4354  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4355  //save the processor in the chain (processor_chains_in_parts and part_assignment_proc_begin_indices)
4356  //to which this processor will send its points in part i.
4357  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4358  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4359  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4360  }
4361  num_points_to_sent = 0;
4362  }
4363  else {
4364  //the receiving processor does not have enough space for all the points.
4365  if(space_left_in_sent_proc > 0){
4366  num_points_to_sent -= space_left_in_sent_proc;
4367 
4368  //there is still some space left in the receiving processor.
4369  if (this->myRank == nonassigned_proc_id){
4370  //send only as much as the space left in this case.
4371  send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4372  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4373  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4374  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4375 
4376  }
4377  }
4378  //advance to the next receiving processor.
4379  ++next_proc_to_send_index;
4380 
4381 #ifdef MJ_DEBUG
4382  if(next_proc_to_send_index < num_procs - required_proc_count ){
4383  std::cout << "Migration - processor assignments - for part:"
4384  << i
4385  << " next_proc_to_send :" << next_proc_to_send_index
4386  << " num_procs:" << num_procs
4387  << " required_proc_count:" << required_proc_count
4388  << " Error: next_proc_to_send_index < num_procs - required_proc_count" << std::endl;
4389  exit(1);
4390 
4391  }
4392 #endif
4393  //get the id of the next processor to send to.
4394  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4395  //set the new space in the processor.
4396  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4397  }
4398  }
4399  }
4400  }
4401 
4402 
4403 
4404  this->assign_send_destinations(
4405  num_parts,
4406  part_assignment_proc_begin_indices,
4407  processor_chains_in_parts,
4408  send_count_to_each_proc,
4409  coordinate_destinations);
4410 
4411  freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4412  freeArray<mj_part_t>(processor_chains_in_parts);
4413  freeArray<mj_part_t>(processor_part_assignments);
4414  freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4415  freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4416 
4417 }
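// Editor's note: a minimal illustration of the sign-bit trick used with
// uSignedSortItem above. To mark already-assigned processors without negating a
// possibly unsigned count, an explicit sign bit is stored beside the value and
// compared first, so all "negative" items order before all "positive" ones. The
// struct below is an illustrative stand-in, not the real uSignedSortItem.
struct ExampleSignedSortItem {
  int id;
  unsigned long long val; // count; may be an unsigned type, hence no real negation
  char signbit;           // 1 for positive (unassigned), 0 for negative (assigned)
  bool operator<(const ExampleSignedSortItem &o) const {
    if (signbit != o.signbit) return signbit < o.signbit; // "negatives" sort first
    if (signbit) return val < o.val;                      // positives: ascending count
    return val > o.val;                                   // negatives: descending magnitude
  }
};
// std::sort over an array of these leaves the unassigned processors, in increasing
// order of their counts, at the tail, mirroring what uqSignsort provides above.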
4418 
4419 
4432 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4433  typename mj_part_t>
4434 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4435  mj_part_t num_parts,
4436  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
4437  int *coordinate_destinations,
4438  mj_part_t &output_part_numbering_begin_index,
4439  std::vector<mj_part_t> *next_future_num_parts_in_parts){
4440 
4441  mj_part_t part_shift_amount = output_part_numbering_begin_index;
4442  mj_part_t previous_processor = -1;
4443  for(mj_part_t i = 0; i < num_parts; ++i){
4444  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4445  //assigned processors are sorted.
4446  mj_lno_t part_begin_index = 0;
4447  if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4448  mj_lno_t part_end_index = this->new_part_xadj[p];
4449 
4450  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4451  if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4452  output_part_numbering_begin_index = part_shift_amount;
4453  }
4454  previous_processor = assigned_proc;
4455  part_shift_amount += (*next_future_num_parts_in_parts)[p];
4456 
4457  for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4458  mj_lno_t localInd = this->new_coordinate_permutations[j];
4459  coordinate_destinations[localInd] = assigned_proc;
4460  }
4461  }
4462 }
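// Editor's note: a sketch of the part-numbering shift computed above. Parts are
// visited in order of their assigned processor; a running sum of each part's future
// part count gives the first global part number owned by each processor. All names
// are illustrative assumptions.
inline int example_numbering_begin(int my_rank, int num_parts,
                                   const int *sorted_part_ids, // parts sorted by assigned processor
                                   const int *assigned_procs,  // processor of each sorted part
                                   const int *future_parts,    // future part count of each part
                                   int begin_index){           // incoming numbering offset
  int shift = begin_index;
  for (int i = 0; i < num_parts; ++i){
    if (assigned_procs[i] == my_rank) return shift; // my first part: my numbering starts here
    shift += future_parts[sorted_part_ids[i]];
  }
  return begin_index; // this rank owns no part (does not occur in this code path)
}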
4463 
4464 
4481 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4482  typename mj_part_t>
4483 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4484  mj_gno_t * num_points_in_all_processor_parts,
4485  mj_part_t num_parts,
4486  mj_part_t num_procs,
4487  mj_lno_t *send_count_to_each_proc, //output: sized num_procs; holds the number of points to send to each proc.
4488  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input: how many more parts each part will later be partitioned into.
4489  mj_part_t &out_num_part, //output: how many parts this processor will have; may be more than 1, since several parts can be assigned to one processor here.
4490  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
4491  mj_part_t &output_part_numbering_begin_index, //output: how much the part numbers should be shifted when setting the solution.
4492  int *coordinate_destinations){
4493  out_num_part = 0;
4494 
4495  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4496  out_part_indices.clear();
4497 
4498  //to sort the parts that are assigned to the processors.
4499  //id is the part number; the sort value is the assigned processor id.
4500  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4501  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4502 
4503 
4504  //calculate the optimal number of coordinates that should be assigned to each processor.
4505  mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
4506  //to hold the remaining space, in number of coordinates, up to the optimal number in each proc.
4507  mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4508  //initialize left space in each.
4509  for (mj_part_t i = 0; i < num_procs; ++i){
4510  space_in_each_processor[i] = work_each;
4511  }
4512 
4513  //we keep track of how many parts each processor is assigned to,
4514  //because with some weird inputs, it might be possible that some
4515  //processors are not assigned to any part. Using these variables,
4516  //we force each processor to have at least one part.
4517  mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4518  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
4519  int empty_proc_count = num_procs;
4520 
4521  //to sort the parts with respect to the number of their coordinates.
4522  //ids are the part numbers; the sort value is the number of points in each.
4523  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4524 
4525  //initially we will sort the parts according to the number of coordinates they have,
4526  //so that we will start assigning with the part that has the most coordinates.
4527  for (mj_part_t i = 0; i < num_parts; ++i){
4528  sort_item_point_counts_in_parts[i].id = i;
4529  sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4530  }
4531  //sort parts with increasing order of loads.
4532  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4533 
4534 
4535  //assigning parts to the processors:
4536  //traverse the parts in decreasing order of load;
4537  //first assign the heaviest part.
4538  for (mj_part_t j = 0; j < num_parts; ++j){
4539  //sorted in increasing order; traverse in reverse.
4540  mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4541  //load of the part
4542  mj_gno_t load = global_num_points_in_parts[i];
4543 
4544  //assigned processors
4545  mj_part_t assigned_proc = -1;
4546  //fallback: the best candidate processor if the part fits nowhere.
4547  mj_part_t best_proc_to_assign = 0;
4548 
4549 
4550  //sort processors with increasing number of points in this part.
4551  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4552  sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4553 
4554  //if there are still enough parts to fill the empty processors, then proceed normally;
4555  //but if the empty processor count equals the number of remaining parts, then
4556  //we force part assignments to go only to empty processors.
4557  if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4558  //how many points does processor ii have in part i?
4559  sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4560  }
4561  else {
4562  sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4563  }
4564  }
4565  uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4566 
4567  //traverse all processors with decreasing load.
4568  for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4569  mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4570  mj_lno_t left_space = space_in_each_processor[ii] - load;
4571  //if there is enough space, assign the part to this processor.
4572  if(left_space >= 0 ){
4573  assigned_proc = ii;
4574  break;
4575  }
4576  //if the space is not enough, remember the best candidate processor.
4577  if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4578  best_proc_to_assign = ii;
4579  }
4580  }
4581 
4582  //if none had enough space, then assign the part to the best candidate processor.
4583  if (assigned_proc == -1){
4584  assigned_proc = best_proc_to_assign;
4585  }
4586 
4587  if (num_parts_proc_assigned[assigned_proc]++ == 0){
4588  --empty_proc_count;
4589  }
4590  space_in_each_processor[assigned_proc] -= load;
4591  //for sorting later: part i is assigned to processor assigned_proc.
4592  sort_item_part_to_proc_assignment[j].id = i; //part i
4593  sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to this processor.
4594 
4595 
4596  //if the assigned processor is me, increase my part count.
4597  if (assigned_proc == this->myRank){
4598  out_num_part++;
4599  out_part_indices.push_back(i);
4600  }
4601  //increase the send count to that processor by the number of points I have in that part,
4602  //as everyone sends their coordinates in this part to the processor assigned to it.
4603  send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4604  }
4605  freeArray<mj_part_t>(num_parts_proc_assigned);
4606  freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4607  freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4608  freeArray<mj_lno_t >(space_in_each_processor);
4609 
4610 
4611  //sort assignments with respect to the assigned processors.
4612  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4613  //fill the coordinate destinations.
4614 
4615 
4616  this->assign_send_destinations2(
4617  num_parts,
4618  sort_item_part_to_proc_assignment,
4619  coordinate_destinations,
4620  output_part_numbering_begin_index,
4621  next_future_num_parts_in_parts);
4622 
4623  freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4624 }
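// Editor's note: a serial sketch of the greedy placement above. Each part (taken in
// decreasing order of load) goes to the processor that already owns the most of its
// points among those with enough remaining capacity; if none fits, the processor
// with the most remaining space takes the overflow. Plain types, illustrative names.
inline int example_pick_processor(int num_procs, long long load,
                                  const long long *owned_in_part, // points each proc already owns in this part
                                  long long *space_in_proc){      // remaining capacity of each proc
  int chosen = -1, roomiest = 0;
  long long best_owned = -1;
  for (int p = 0; p < num_procs; ++p){
    if (space_in_proc[p] >= load && owned_in_part[p] > best_owned){
      best_owned = owned_in_part[p]; // fits, and owns the most points seen so far
      chosen = p;
    }
    if (space_in_proc[p] > space_in_proc[roomiest]) roomiest = p;
  }
  if (chosen == -1) chosen = roomiest; // nothing fits: overflow the roomiest processor
  space_in_proc[chosen] -= load;       // charge the load either way (may go negative)
  return chosen;
}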
4625 
4626 
4644 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4645  typename mj_part_t>
4646 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
4647  mj_gno_t * num_points_in_all_processor_parts,
4648  mj_part_t num_parts,
4649  mj_part_t num_procs,
4650  mj_lno_t *send_count_to_each_proc,
4651  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4652  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4653  mj_part_t &out_num_part,
4654  std::vector<mj_part_t> &out_part_indices,
4655  mj_part_t &output_part_numbering_begin_index,
4656  int *coordinate_destinations){
4657 
4658 
4659 
4660  processor_ranks_for_subcomm.clear();
4662  if (num_procs > num_parts){
4663  //if there are more processors than the number of current parts,
4664  //then processors share the existing parts;
4665  //at the end each processor will have a single part,
4666  //but a part will be shared by a group of processors.
4667  mj_part_t out_part_index = 0;
4668  this->mj_assign_proc_to_parts(
4669  num_points_in_all_processor_parts,
4670  num_parts,
4671  num_procs,
4672  send_count_to_each_proc,
4673  processor_ranks_for_subcomm,
4674  next_future_num_parts_in_parts,
4675  out_part_index,
4676  output_part_numbering_begin_index,
4677  coordinate_destinations
4678  );
4679 
4680  out_num_part = 1;
4681  out_part_indices.clear();
4682  out_part_indices.push_back(out_part_index);
4683  }
4684  else {
4685 
4686  //there are more parts than processors,
4687  //therefore a processor will be assigned multiple parts,
4688  //and the subcommunicators will each contain a single processor.
4689  processor_ranks_for_subcomm.push_back(this->myRank);
4690 
4691  //since there are more parts than procs,
4692  //assign multiple parts to processors.
4693  this->mj_assign_parts_to_procs(
4694  num_points_in_all_processor_parts,
4695  num_parts,
4696  num_procs,
4697  send_count_to_each_proc,
4698  next_future_num_parts_in_parts,
4699  out_num_part,
4700  out_part_indices,
4701  output_part_numbering_begin_index,
4702  coordinate_destinations);
4703  }
4704 }
4705 
4718 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4719  typename mj_part_t>
4720 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
4721  mj_part_t num_procs,
4722  mj_lno_t &num_new_local_points,
4723  std::string iteration,
4724  int *coordinate_destinations,
4725  mj_part_t num_parts)
4726 {
4727 #ifdef ENABLE_ZOLTAN_MIGRATION
4728  if (sizeof(mj_lno_t) <= sizeof(int)) {
4729 
4730  // Cannot use Zoltan_Comm with local ordinals larger than ints.
4731  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
4732  // may overflow.
4733 
4734  ZOLTAN_COMM_OBJ *plan = NULL;
4735  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4736  int num_incoming_gnos = 0;
4737  int message_tag = 7859;
4738 
4739  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4740  int ierr = Zoltan_Comm_Create(
4741  &plan,
4742  int(this->num_local_coords),
4743  coordinate_destinations,
4744  mpi_comm,
4745  message_tag,
4746  &num_incoming_gnos);
4747  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4748  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4749 
4750  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
4751  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4752 
4753  //migrate gnos.
4754  message_tag++;
4755  ierr = Zoltan_Comm_Do(
4756  plan,
4757  message_tag,
4758  (char *) this->current_mj_gnos,
4759  sizeof(mj_gno_t),
4760  (char *) incoming_gnos);
4761  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4762 
4763  freeArray<mj_gno_t>(this->current_mj_gnos);
4764  this->current_mj_gnos = incoming_gnos;
4765 
4766 
4767  //migrate coordinates
4768  for (int i = 0; i < this->coord_dim; ++i){
4769  message_tag++;
4770  mj_scalar_t *coord = this->mj_coordinates[i];
4771 
4772  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4773  ierr = Zoltan_Comm_Do(
4774  plan,
4775  message_tag,
4776  (char *) coord,
4777  sizeof(mj_scalar_t),
4778  (char *) this->mj_coordinates[i]);
4779  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4780  freeArray<mj_scalar_t>(coord);
4781  }
4782 
4783  //migrate weights.
4784  for (int i = 0; i < this->num_weights_per_coord; ++i){
4785  message_tag++;
4786  mj_scalar_t *weight = this->mj_weights[i];
4787 
4788  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4789  ierr = Zoltan_Comm_Do(
4790  plan,
4791  message_tag,
4792  (char *) weight,
4793  sizeof(mj_scalar_t),
4794  (char *) this->mj_weights[i]);
4795  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4796  freeArray<mj_scalar_t>(weight);
4797  }
4798 
4799 
4800  //migrate owners.
4801  int *coord_own = allocMemory<int>(num_incoming_gnos);
4802  message_tag++;
4803  ierr = Zoltan_Comm_Do(
4804  plan,
4805  message_tag,
4806  (char *) this->owner_of_coordinate,
4807  sizeof(int), (char *) coord_own);
4808  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4809  freeArray<int>(this->owner_of_coordinate);
4810  this->owner_of_coordinate = coord_own;
4811 
4812 
4813  //if the number of procs is less than the number of parts,
4814  //we need the part assignment arrays as well, since
4815  //there will be multiple parts per processor.
4816  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4817  if(num_procs < num_parts){
4818  message_tag++;
4819  ierr = Zoltan_Comm_Do(
4820  plan,
4821  message_tag,
4822  (char *) this->assigned_part_ids,
4823  sizeof(mj_part_t),
4824  (char *) new_parts);
4825  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4826  }
4827  freeArray<mj_part_t>(this->assigned_part_ids);
4828  this->assigned_part_ids = new_parts;
4829 
4830  ierr = Zoltan_Comm_Destroy(&plan);
4831  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4832  num_new_local_points = num_incoming_gnos;
4833  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
4834  }
4835 
4836  else
4837 
4838 #endif // ENABLE_ZOLTAN_MIGRATION
4839  {
4840  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
4841  Tpetra::Distributor distributor(this->comm);
4842  ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
4843  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
4844  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
4845 
4846  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
4847  {
4848  //migrate gnos.
4849  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
4850  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
4851  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
4852  freeArray<mj_gno_t>(this->current_mj_gnos);
4853  this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
4854  memcpy(
4855  this->current_mj_gnos,
4856  received_gnos.getRawPtr(),
4857  num_incoming_gnos * sizeof(mj_gno_t));
4858  }
4859  //migrate coordinates
4860  for (int i = 0; i < this->coord_dim; ++i){
4861 
4862  ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
4863  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
4864  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
4865  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
4866  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4867  memcpy(
4868  this->mj_coordinates[i],
4869  received_coord.getRawPtr(),
4870  num_incoming_gnos * sizeof(mj_scalar_t));
4871  }
4872 
4873  //migrate weights.
4874  for (int i = 0; i < this->num_weights_per_coord; ++i){
4875 
4876  ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
4877  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
4878  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
4879  freeArray<mj_scalar_t>(this->mj_weights[i]);
4880  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4881  memcpy(
4882  this->mj_weights[i],
4883  received_weight.getRawPtr(),
4884  num_incoming_gnos * sizeof(mj_scalar_t));
4885  }
4886 
4887  {
4888  //migrate the owners of the coordinates
4889  ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
4890  ArrayRCP<int> received_owners(num_incoming_gnos);
4891  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
4892  freeArray<int>(this->owner_of_coordinate);
4893  this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
4894  memcpy(
4895  this->owner_of_coordinate,
4896  received_owners.getRawPtr(),
4897  num_incoming_gnos * sizeof(int));
4898  }
4899 
4900  //if num procs is less than num parts,
4901  //we need the part assignment arrays as well, since
4902  //there will be multiple parts per processor.
4903  if(num_procs < num_parts){
4904  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
4905  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
4906  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
4907  freeArray<mj_part_t>(this->assigned_part_ids);
4908  this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
4909  memcpy(
4910  this->assigned_part_ids,
4911  received_partids.getRawPtr(),
4912  num_incoming_gnos * sizeof(mj_part_t));
4913  }
4914  else {
4915  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4916  freeArray<mj_part_t>(this->assigned_part_ids);
4917  this->assigned_part_ids = new_parts;
4918  }
4919  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
4920  num_new_local_points = num_incoming_gnos;
4921 
4922  }
4923 }
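// Illustrative sketch (not compiled into the build): the Zoltan branch above
// follows the standard Zoltan_Comm life cycle. A plan is created once from the
// destination rank of each local element, every array is then shipped with its
// own Zoltan_Comm_Do call and a fresh message tag, and the plan is destroyed
// at the end. The names num_local, dest, my_data, and mpi_comm below are
// hypothetical stand-ins.
#if 0
ZOLTAN_COMM_OBJ *plan = NULL;
int message_tag = 7859;
int num_incoming = 0;
int ierr = Zoltan_Comm_Create(&plan, num_local, dest, mpi_comm,
 message_tag, &num_incoming);
Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);

//one Zoltan_Comm_Do per array; each exchange gets a new tag.
int *incoming_data = allocMemory<int>(num_incoming);
message_tag++;
ierr = Zoltan_Comm_Do(plan, message_tag, (char *) my_data,
 sizeof(int), (char *) incoming_data);
Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);

ierr = Zoltan_Comm_Destroy(&plan);
Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
#endif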
4924 
4931 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4932  typename mj_part_t>
4933 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
4934  mj_part_t group_size = processor_ranks_for_subcomm.size();
4935  mj_part_t *ids = allocMemory<mj_part_t>(group_size);
4936  for(mj_part_t i = 0; i < group_size; ++i) {
4937  ids[i] = processor_ranks_for_subcomm[i];
4938  }
4939  ArrayView<const mj_part_t> idView(ids, group_size);
4940  this->comm = this->comm->createSubcommunicator(idView);
4941  freeArray<mj_part_t>(ids);
4942 }
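// Illustrative sketch (not compiled into the build): createSubcommunicator
// keeps only the listed ranks; a process whose rank is not in the view gets a
// null comm back. The rank list below (every even rank) is a hypothetical
// example.
#if 0
std::vector<int> keep;
for (int r = 0; r < comm->getSize(); r += 2) keep.push_back(r); //even ranks
Teuchos::ArrayView<const int> keep_view(&keep[0], keep.size());
RCP<Teuchos::Comm<int> > sub_comm = comm->createSubcommunicator(keep_view);
if (!sub_comm.is_null()) {
 //this process belongs to the smaller communicator.
}
#endif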
4943 
4944 
4950 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4951  typename mj_part_t>
4952 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
4953  mj_part_t output_num_parts,
4954  mj_part_t num_parts){
4955  //if there is single output part, then simply fill the permutation array.
4956  if (output_num_parts == 1){
4957  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4958  this->new_coordinate_permutations[i] = i;
4959  }
4960  this->new_part_xadj[0] = this->num_local_coords;
4961  }
4962  else {
4963 
4964  //otherwise we need to count how many points there are in each part.
4965  //we allocate here as num_parts, because the sent partids are up to num_parts,
4966  //although there are output_num_parts different parts.
4967  mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
4968  //part_shifts holds which new part number an old part number corresponds to.
4969  mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
4970 
4971  memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);
4972 
4973  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4974  mj_part_t ii = this->assigned_part_ids[i];
4975  ++num_points_in_parts[ii];
4976  }
4977 
4978  //write the end points of the parts.
4979  mj_part_t p = 0;
4980  mj_lno_t prev_index = 0;
4981  for(mj_part_t i = 0; i < num_parts; ++i){
4982  if(num_points_in_parts[i] > 0) {
4983  this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
4984  prev_index += num_points_in_parts[i];
4985  part_shifts[i] = p++;
4986  }
4987  }
4988 
4989  //for the rest of the parts write the end index as end point.
4990  mj_part_t assigned_num_parts = p - 1;
4991  for (;p < num_parts; ++p){
4992  this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
4993  }
4994  for(mj_part_t i = 0; i < output_num_parts; ++i){
4995  num_points_in_parts[i] = this->new_part_xadj[i];
4996  }
4997 
4998  //write the permutation array here.
4999  //get the part of the coordinate i, shift it to obtain the new part number.
5000  //assign it to the end of the new part numbers pointer.
5001  for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5002  mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5003  this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5004  }
5005 
5006  freeArray<mj_lno_t>(num_points_in_parts);
5007  freeArray<mj_part_t>(part_shifts);
5008  }
5009 }
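// Illustrative sketch (not compiled into the build): the loop above is a
// counting sort. Count the points per part, turn the counts into end offsets,
// then walk the points backwards so each one drops into its part's slot.
// The same pattern on plain arrays with hypothetical sizes:
#if 0
const int n = 8, nparts = 3;
int part_of[n] = {2, 0, 1, 2, 0, 1, 1, 0}; //part of each element
int count[nparts] = {0, 0, 0};
int perm[n];
for (int i = 0; i < n; ++i) ++count[part_of[i]]; //histogram
for (int p = 1; p < nparts; ++p) count[p] += count[p - 1]; //end offsets
for (int i = n - 1; i >= 0; --i) perm[--count[part_of[i]]] = i; //stable fill
//perm now lists the elements of part 0, then part 1, then part 2.
#endif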
5010 
5011 
5034 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5035  typename mj_part_t>
5036 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5037  mj_part_t input_num_parts, //current umb parts
5038  mj_part_t &output_num_parts, //output umb parts.
5039  std::vector<mj_part_t> *next_future_num_parts_in_parts,
5040  mj_part_t &output_part_begin_index,
5041  size_t migration_reduce_all_population,
5042  mj_lno_t num_coords_for_last_dim_part,
5043  std::string iteration,
5044  RCP<mj_partBoxVector_t> &input_part_boxes,
5045  RCP<mj_partBoxVector_t> &output_part_boxes
5046 )
5047 {
5048  mj_part_t num_procs = this->comm->getSize();
5049  this->myRank = this->comm->getRank();
5050 
5051 
5052  //this array holds how many points each processor has in each part.
5053  //to access how many points processor i has on part j,
5054  //num_points_in_all_processor_parts[i * num_parts + j]
5055  mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5056 
5057  //get the number of coordinates in each part in each processor.
5058  this->get_processor_num_points_in_parts(
5059  num_procs,
5060  input_num_parts,
5061  num_points_in_all_processor_parts);
5062 
5063 
5064  //check if migration will be performed or not.
5065  if (!this->mj_check_to_migrate(
5066  migration_reduce_all_population,
5067  num_coords_for_last_dim_part,
5068  num_procs,
5069  input_num_parts,
5070  num_points_in_all_processor_parts)){
5071  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5072  return false;
5073  }
5074 
5075 
5076  mj_lno_t *send_count_to_each_proc = NULL;
5077  int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5078  send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5079  for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5080 
5081  std::vector<mj_part_t> processor_ranks_for_subcomm;
5082  std::vector<mj_part_t> out_part_indices;
5083 
5084  //determine which processors are assigned to which parts
5085  this->mj_migration_part_proc_assignment(
5086  num_points_in_all_processor_parts,
5087  input_num_parts,
5088  num_procs,
5089  send_count_to_each_proc,
5090  processor_ranks_for_subcomm,
5091  next_future_num_parts_in_parts,
5092  output_num_parts,
5093  out_part_indices,
5094  output_part_begin_index,
5095  coordinate_destinations);
5096 
5097 
5098 
5099 
5100  freeArray<mj_lno_t>(send_count_to_each_proc);
5101  std::vector <mj_part_t> tmpv;
5102 
5103  std::sort (out_part_indices.begin(), out_part_indices.end());
5104  mj_part_t outP = out_part_indices.size();
5105 
5106  mj_gno_t new_global_num_points = 0;
5107  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5108 
5109  if (this->mj_keep_part_boxes){
5110  input_part_boxes->clear();
5111  }
5112 
5113  //now we calculate the new values for next_future_num_parts_in_parts.
5114  //same for the part boxes.
5115  for (mj_part_t i = 0; i < outP; ++i){
5116  mj_part_t ind = out_part_indices[i];
5117  new_global_num_points += global_num_points_in_parts[ind];
5118  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5119  if (this->mj_keep_part_boxes){
5120  input_part_boxes->push_back((*output_part_boxes)[ind]);
5121  }
5122  }
5123  //swap the input and output part boxes.
5124  if (this->mj_keep_part_boxes){
5125  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5126  input_part_boxes = output_part_boxes;
5127  output_part_boxes = tmpPartBoxes;
5128  }
5129  next_future_num_parts_in_parts->clear();
5130  for (mj_part_t i = 0; i < outP; ++i){
5131  mj_part_t p = tmpv[i];
5132  next_future_num_parts_in_parts->push_back(p);
5133  }
5134 
5135  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5136 
5137  mj_lno_t num_new_local_points = 0;
5138 
5139 
5140  //perform the actual migration operation here.
5141  this->mj_migrate_coords(
5142  num_procs,
5143  num_new_local_points,
5144  iteration,
5145  coordinate_destinations,
5146  input_num_parts);
5147 
5148 
5149  freeArray<int>(coordinate_destinations);
5150 
5151  if(this->num_local_coords != num_new_local_points){
5152  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5153  freeArray<mj_lno_t>(this->coordinate_permutations);
5154 
5155  this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5156  this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5157  }
5158  this->num_local_coords = num_new_local_points;
5159  this->num_global_coords = new_global_num_points;
5160 
5161 
5162 
5163  //create subcommunicator.
5164  this->create_sub_communicator(processor_ranks_for_subcomm);
5165  processor_ranks_for_subcomm.clear();
5166 
5167  //fill the new permutation arrays.
5168  this->fill_permutation_array(
5169  output_num_parts,
5170  input_num_parts);
5171  return true;
5172 }
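// Illustrative sketch (not compiled into the build): layout of the
// num_points_in_all_processor_parts table used above. It is a
// (num_procs + 1) x num_parts row-major array whose extra last row holds the
// global per-part sums; i and j below are hypothetical in-range indices.
#if 0
mj_gno_t points_of_proc_i_in_part_j =
 num_points_in_all_processor_parts[i * num_parts + j];
mj_gno_t *global_num_points_in_parts =
 num_points_in_all_processor_parts + num_procs * num_parts;
mj_gno_t global_points_in_part_j = global_num_points_in_parts[j];
#endif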
5173 
5174 
5188 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5189  typename mj_part_t>
5190 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5191  mj_part_t num_parts,
5192  mj_scalar_t *mj_current_dim_coords,
5193  mj_scalar_t *current_concurrent_cut_coordinate,
5194  mj_lno_t coordinate_begin,
5195  mj_lno_t coordinate_end,
5196  mj_scalar_t *used_local_cut_line_weight_to_left,
5197  mj_lno_t *out_part_xadj,
5198  int coordInd){
5199 
5200  //mj_lno_t numCoordsInPart = coordinateEnd - coordinateBegin;
5201  mj_part_t no_cuts = num_parts - 1;
5202 
5203 
5204 
5205  int me = 0;
5206  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5207  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5208 
5209 
5210  //now if the rectilinear partitioning is allowed we decide how
5211  //much weight each thread should put to left and right.
5212  if (this->distribute_points_on_cut_lines){
5213 
5214  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5215  for (mj_part_t i = 0; i < no_cuts; ++i){
5216  //the weight to be put on the left of the cut.
5217  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5218  //cout << "i:" << i << " left_weight:" << left_weight << endl;
5219  for(int ii = 0; ii < this->num_threads; ++ii){
5220  if(left_weight > this->sEpsilon){
5221  //the weight of thread ii on cut.
5222  mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5223  if(thread_ii_weight_on_cut < left_weight){
5224  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5225  }
5226  else {
5227  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5228  }
5229  left_weight -= thread_ii_weight_on_cut;
5230  }
5231  else {
5232  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5233  }
5234  }
5235  }
5236 
5237  if(no_cuts > 0){
5238  //this is a special case. If cutlines share the same coordinate, their weights are equal.
5239  //we need to adjust the ratio for that.
5240  for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5241  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5242  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5243  }
5244  my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
5245  / mj_scalar_t(SIGNIFICANCE_MUL);
5246  }
5247  }
5248  }
5249 
5250  for(mj_part_t ii = 0; ii < num_parts; ++ii){
5251  thread_num_points_in_parts[ii] = 0;
5252  }
5253 
5254  //for this specific case we don't want to distribute the points along the cut position
5255  //randomly, as we need a specific ordering of them. Instead,
5256  //we put the coordinates into a sort item, where we sort them
5257  //using the coordinates of the points in the other dimensions, plus the index.
5258 
5259 
5260  //some of the cuts might share the same position.
5261  //in this case, if cut i and cut j share the same position
5262  //cut_map[i] = cut_map[j] = sort item index.
5263  mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5264 
5265 
5266  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5267  typedef std::vector< multiSItem > multiSVector;
5268  typedef std::vector<multiSVector> multiS2Vector;
5269 
5270  //to keep track of the memory allocated.
5271  std::vector<mj_scalar_t *>allocated_memory;
5272 
5273  //vector for which the coordinates will be sorted.
5274  multiS2Vector sort_vector_points_on_cut;
5275 
5276  //the number of cuts that have different coordinates.
5277  mj_part_t different_cut_count = 1;
5278  cut_map[0] = 0;
5279 
5280  //now we insert 1 sort vector for all cuts on the different
5281  //positions. if multiple cuts are on the same position, they share sort vectors.
5282  multiSVector tmpMultiSVector;
5283  sort_vector_points_on_cut.push_back(tmpMultiSVector);
5284 
5285  for (mj_part_t i = 1; i < no_cuts ; ++i){
5286  //if cuts share the same cut coordinates
5287  //set the cutmap accordingly.
5288  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5289  cut_map[i] = cut_map[i-1];
5290  }
5291  else {
5292  cut_map[i] = different_cut_count++;
5293  multiSVector tmp2MultiSVector;
5294  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5295  }
5296  }
5297 
5298 
5299  //now the actual part assignment.
5300  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5301 
5302  mj_lno_t i = this->coordinate_permutations[ii];
5303 
5304  mj_part_t pp = this->assigned_part_ids[i];
5305  mj_part_t p = pp / 2;
5306  //if the coordinate is on a cut.
5307  if(pp % 2 == 1 ){
5308  mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5309  allocated_memory.push_back(vals);
5310 
5311  //we insert the coordinates to the sort item here.
5312  int val_ind = 0;
5313  for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5314  vals[val_ind++] = this->mj_coordinates[dim][i];
5315  }
5316  for(int dim = 0; dim < coordInd; ++dim){
5317  vals[val_ind++] = this->mj_coordinates[dim][i];
5318  }
5319  multiSItem tempSortItem(i, this->coord_dim -1, vals);
5320  //insert the point into the sort vector pointed to by cut_map[p].
5321  mj_part_t cmap = cut_map[p];
5322  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5323  }
5324  else {
5325  //if it is not on a cut, no sort item is needed; assign it directly.
5326  ++thread_num_points_in_parts[p];
5327  this->assigned_part_ids[i] = p;
5328  }
5329  }
5330 
5331  //sort all the sort vectors.
5332  for (mj_part_t i = 0; i < different_cut_count; ++i){
5333  std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5334  }
5335 
5336  //we do the part assignment for the points on cuts here.
5337  mj_part_t previous_cut_map = cut_map[0];
5338 
5339  //this is how much of the current part's weight the previous part already owns.
5340  //when the target part weight is 1.6 and the part on the left is given 2,
5341  //the left has an extra 0.4, while the right is missing 0.4 from the previous cut.
5342  //this parameter is used to balance these issues.
5343  //in the above example weight_stolen_from_previous_part will be 0.4.
5344  //if the left part target is 2.2 but it is given 2,
5345  //then weight_stolen_from_previous_part will be -0.2.
5346  mj_scalar_t weight_stolen_from_previous_part = 0;
5347  for (mj_part_t p = 0; p < no_cuts; ++p){
5348 
5349  mj_part_t mapped_cut = cut_map[p];
5350 
5351  //if the previous cut position is done and differs from the current one,
5352  //then assign all points still left on that cut to the part on its right.
5353  if (previous_cut_map != mapped_cut){
5354  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5355  for (; sort_vector_end >= 0; --sort_vector_end){
5356  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5357  mj_lno_t i = t.index;
5358  ++thread_num_points_in_parts[p];
5359  this->assigned_part_ids[i] = p;
5360  }
5361  sort_vector_points_on_cut[previous_cut_map].clear();
5362  }
5363  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5364 
5365  for (; sort_vector_end >= 0; --sort_vector_end){
5366  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5367  mj_lno_t i = t.index;
5368  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5369 
5370 
5371  //if part p has enough space for point i, assign point i to part p.
5372  if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5373  my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5374  > this->sEpsilon){
5375 
5376  my_local_thread_cut_weights_to_put_left[p] -= w;
5377  sort_vector_points_on_cut[mapped_cut].pop_back();
5378  ++thread_num_points_in_parts[p];
5379  this->assigned_part_ids[i] = p;
5380  //if putting this weight to left overweights the left cut, then
5381  //increase the space for the next cut using weight_stolen_from_previous_part.
5382  if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5383  if(mapped_cut == cut_map[p + 1] ){
5384  //the cut after the cut indexed at p is at the same position;
5385  //special case, as we handle the weight differently here.
5386  if (previous_cut_map != mapped_cut){
5387  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5388  }
5389  else {
5390  //if the cut before the cut indexed at p was also at the same position
5391  //we assign extra weights cumulatively in this case.
5392  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5393  }
5394  }
5395  else{
5396  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5397  }
5398  //end assignment for part p
5399  break;
5400  }
5401  } else {
5402  //if part p does not have enough space for this point
5403  //and if there is another cut sharing the same position,
5404  //again increase the space for the next cut
5405  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5406  if (previous_cut_map != mapped_cut){
5407  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5408  }
5409  else {
5410  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5411  }
5412  }
5413  else{
5414  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5415  }
5416  //end assignment for part p
5417  break;
5418  }
5419  }
5420  previous_cut_map = mapped_cut;
5421  }
5422  //put everything left on the last cut to the last part.
5423  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5424  for (; sort_vector_end >= 0; --sort_vector_end){
5425  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5426  mj_lno_t i = t.index;
5427  ++thread_num_points_in_parts[no_cuts];
5428  this->assigned_part_ids[i] = no_cuts;
5429  }
5430  sort_vector_points_on_cut[previous_cut_map].clear();
5431  freeArray<mj_part_t> (cut_map);
5432 
5433  //free the memory allocated for vertex sort items.
5434  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5435  for(mj_lno_t i = 0; i < vSize; ++i){
5436  freeArray<mj_scalar_t> (allocated_memory[i]);
5437  }
5438 
5439  //creation of part_xadj as in usual case.
5440  for(mj_part_t j = 0; j < num_parts; ++j){
5441  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5442  for (int i = 0; i < this->num_threads; ++i){
5443  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5444  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5445  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5446 
5447  }
5448  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
5449  }
5450 
5451  //perform prefix sum for num_points in parts.
5452  for(mj_part_t j = 1; j < num_parts; ++j){
5453  out_part_xadj[j] += out_part_xadj[j - 1];
5454  }
5455 
5456 
5457  //shift the num points in threads to obtain the
5458  //beginning index of each thread's private space.
5459  for(mj_part_t j = 1; j < num_parts; ++j){
5460  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5461  }
5462 
5463  //now each thread takes its coordinates and writes their indices to the permutation array
5464  //using the part index we calculated.
5465  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5466  mj_lno_t i = this->coordinate_permutations[ii];
5467  mj_part_t p = this->assigned_part_ids[i];
5468  this->new_coordinate_permutations[coordinate_begin +
5469  thread_num_points_in_parts[p]++] = i;
5470  }
5471 }
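// Illustrative sketch (not compiled into the build): points sitting exactly on
// a cut are ordered with a multi-key sort item, comparing the remaining
// coordinate dimensions lexicographically so the order is deterministic.
// The struct below is a simplified stand-in for uMultiSortItem.
#if 0
struct SortItem {
 int index; //index of the coordinate
 int dim; //number of comparison keys
 double *vals; //coordinates in the other dimensions
 bool operator<(const SortItem &rhs) const {
 for (int d = 0; d < dim; ++d) {
 if (vals[d] < rhs.vals[d]) return true;
 if (vals[d] > rhs.vals[d]) return false;
 }
 return index < rhs.index; //final tie-breaker: the index
 }
};
#endif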
5472 
5473 
5474 
5484 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5485  typename mj_part_t>
5486 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5487  mj_part_t current_num_parts,
5488  mj_part_t output_part_begin_index,
5489  RCP<mj_partBoxVector_t> &output_part_boxes,
5490  bool is_data_ever_migrated)
5491 {
5492  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5493 
5494 #ifdef HAVE_ZOLTAN2_OMP
5495 #pragma omp parallel for
5496 #endif
5497  for(mj_part_t i = 0; i < current_num_parts;++i){
5498 
5499  mj_lno_t begin = 0;
5500  mj_lno_t end = this->part_xadj[i];
5501 
5502  if(i > 0) begin = this->part_xadj[i -1];
5503  mj_part_t part_to_set_index = i + output_part_begin_index;
5504  if (this->mj_keep_part_boxes){
5505  (*output_part_boxes)[i].setpId(part_to_set_index);
5506  }
5507  for (mj_lno_t ii = begin; ii < end; ++ii){
5508  mj_lno_t k = this->coordinate_permutations[ii];
5509  this->assigned_part_ids[k] = part_to_set_index;
5510  }
5511  }
5512 
5513  //ArrayRCP<const mj_gno_t> gnoList;
5514  if(!is_data_ever_migrated){
5515  //freeArray<mj_gno_t>(this->current_mj_gnos);
5516  //if(this->num_local_coords > 0){
5517  // gnoList = arcpFromArrayView(this->mj_gnos);
5518  //}
5519  }
5520  else {
5521 #ifdef ENABLE_ZOLTAN_MIGRATION
5522  if (sizeof(mj_lno_t) <= sizeof(int)) {
5523 
5524  // Cannot use Zoltan_Comm with local ordinals larger than ints.
5525  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
5526  // may overflow.
5527 
5528  //if data is migrated, then send part numbers to the original owners.
5529  ZOLTAN_COMM_OBJ *plan = NULL;
5530  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5531 
5532  int incoming = 0;
5533  int message_tag = 7856;
5534 
5535  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
5536  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
5537  this->owner_of_coordinate, mpi_comm, message_tag,
5538  &incoming);
5539  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5540  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );
5541 
5542  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5543 
5544  message_tag++;
5545  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5546  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
5547  sizeof(mj_gno_t), (char *) incoming_gnos);
5548  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5549 
5550  freeArray<mj_gno_t>(this->current_mj_gnos);
5551  this->current_mj_gnos = incoming_gnos;
5552 
5553  mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5554 
5555  message_tag++;
5556  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
5557  sizeof(mj_part_t), (char *) incoming_partIds);
5558  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5559  freeArray<mj_part_t>(this->assigned_part_ids);
5560  this->assigned_part_ids = incoming_partIds;
5561 
5562  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5563  ierr = Zoltan_Comm_Destroy(&plan);
5564  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5565 
5566  this->num_local_coords = incoming;
5567  //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
5568  }
5569  else
5570 
5571 #endif // !ENABLE_ZOLTAN_MIGRATION
5572  {
5573  //if data is migrated, then send part numbers to the original owners.
5574  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
5575  Tpetra::Distributor distributor(this->mj_problemComm);
5576  ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5577  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5578  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );
5579 
5580  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
5581  //migrate gnos to actual owners.
5582  ArrayRCP<mj_gno_t> received_gnos(incoming);
5583  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5584  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5585  freeArray<mj_gno_t>(this->current_mj_gnos);
5586  this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5587  memcpy( this->current_mj_gnos,
5588  received_gnos.getRawPtr(),
5589  incoming * sizeof(mj_gno_t));
5590 
5591  //migrate part ids to actual owners.
5592  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5593  ArrayRCP<mj_part_t> received_partids(incoming);
5594  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5595  freeArray<mj_part_t>(this->assigned_part_ids);
5596  this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5597  memcpy( this->assigned_part_ids,
5598  received_partids.getRawPtr(),
5599  incoming * sizeof(mj_part_t));
5600 
5601  this->num_local_coords = incoming;
5602  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
5603 
5604  }
5605  }
5606 
5607  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5608 
5609  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
5610 
5611  //ArrayRCP<mj_part_t> partId;
5612  //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
5613 
5614  if (this->mj_keep_part_boxes){
5615  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
5616 
5617  }
5618 
5619  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
5620 }
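// Illustrative sketch (not compiled into the build): both migration paths
// above use the same Tpetra::Distributor idiom. createFromSends takes one
// destination rank per local element and returns the number of incoming
// elements; each doPostsAndWaits call then ships one array. The names dest,
// my_data, and num_local below are hypothetical.
#if 0
Tpetra::Distributor distributor(comm);
ArrayView<const int> dest_ranks(dest, num_local); //one rank per element
const size_t num_incoming = distributor.createFromSends(dest_ranks);

ArrayView<const int> exports(my_data, num_local);
ArrayRCP<int> imports(num_incoming);
distributor.doPostsAndWaits<int>(exports, 1, imports()); //1 value per element
#endif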
5621 
5624 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5625  typename mj_part_t>
5626 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
5627  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");
5628 
5629  for (int i=0; i < this->coord_dim; i++){
5630  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5631  }
5632  freeArray<mj_scalar_t *>(this->mj_coordinates);
5633 
5634  for (int i=0; i < this->num_weights_per_coord; i++){
5635  freeArray<mj_scalar_t>(this->mj_weights[i]);
5636  }
5637  freeArray<mj_scalar_t *>(this->mj_weights);
5638 
5639  freeArray<int>(this->owner_of_coordinate);
5640 
5641  for(int i = 0; i < this->num_threads; ++i){
5642  freeArray<mj_lno_t>(this->thread_point_counts[i]);
5643  }
5644 
5645  freeArray<mj_lno_t *>(this->thread_point_counts);
5646  freeArray<double *> (this->thread_part_weight_work);
5647 
5648  if(this->distribute_points_on_cut_lines){
5649  freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5650  for(int i = 0; i < this->num_threads; ++i){
5651  freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5652  }
5653  freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5654  freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5655  freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5656  }
5657 
5658  freeArray<mj_part_t>(this->my_incomplete_cut_count);
5659 
5660  freeArray<mj_scalar_t>(this->max_min_coords);
5661 
5662  freeArray<mj_lno_t>(this->part_xadj);
5663 
5664  freeArray<mj_lno_t>(this->coordinate_permutations);
5665 
5666  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5667 
5668  freeArray<mj_scalar_t>(this->all_cut_coordinates);
5669 
5670  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5671 
5672  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5673 
5674  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5675 
5676  freeArray<mj_scalar_t>(this->target_part_weights);
5677 
5678  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5679 
5680  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5681 
5682  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5683  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5684  freeArray<bool>(this->is_cut_line_determined);
5685  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5686  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5687 
5688  for(int i = 0; i < this->num_threads; ++i){
5689  freeArray<double>(this->thread_part_weights[i]);
5690  freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5691  freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5692  }
5693 
5694  freeArray<double *>(this->thread_part_weights);
5695  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5696  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5697 
5698  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
5699 }
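// Illustrative sketch (not compiled into the build): every buffer released
// above was obtained through allocMemory, so this function mirrors
// allocate_set_work_memory; two-level buffers free the inner arrays before
// the outer pointer array. num_rows and row_len below are hypothetical.
#if 0
double **scratch = allocMemory<double *>(num_rows);
for (int i = 0; i < num_rows; ++i) scratch[i] = allocMemory<double>(row_len);
//... use scratch ...
for (int i = 0; i < num_rows; ++i) freeArray<double>(scratch[i]); //inner first
freeArray<double *>(scratch); //then the pointer array itself
#endif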
5700 
5708 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5709  typename mj_part_t>
5710 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_partitioning_parameters(
5711  bool distribute_points_on_cut_lines_,
5712  int max_concurrent_part_calculation_,
5713  int check_migrate_avoid_migration_option_,
5714  mj_scalar_t minimum_migration_imbalance_){
5715  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5716  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5717  this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5718  this->minimum_migration_imbalance = minimum_migration_imbalance_;
5719 
5720 }
5721 
5722 
5751 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5752  typename mj_part_t>
5753 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::multi_jagged_part(
5754 
5755  const RCP<const Environment> &env,
5756  RCP<const Comm<int> > &problemComm,
5757 
5758  double imbalance_tolerance_,
5759  size_t num_global_parts_,
5760  mj_part_t *part_no_array_,
5761  int recursion_depth_,
5762 
5763  int coord_dim_,
5764  mj_lno_t num_local_coords_,
5765  mj_gno_t num_global_coords_,
5766  const mj_gno_t *initial_mj_gnos_,
5767  mj_scalar_t **mj_coordinates_,
5768 
5769  int num_weights_per_coord_,
5770  bool *mj_uniform_weights_,
5771  mj_scalar_t **mj_weights_,
5772  bool *mj_uniform_parts_,
5773  mj_scalar_t **mj_part_sizes_,
5774 
5775  mj_part_t *&result_assigned_part_ids_,
5776  mj_gno_t *&result_mj_gnos_
5777 )
5778 {
5779 
5780 #ifdef print_debug
5781  if(comm->getRank() == 0){
5782  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
5783  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
5784  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
5785  }
5786 #endif
5787 
5788  this->mj_env = env;
5789  this->mj_problemComm = problemComm;
5790  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
5791 
5792  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
5793  this->mj_env->debug(3, "In MultiJagged Jagged");
5794 
5795  {
5796  this->imbalance_tolerance = imbalance_tolerance_;
5797  this->num_global_parts = num_global_parts_;
5798  this->part_no_array = part_no_array_;
5799  this->recursion_depth = recursion_depth_;
5800 
5801  this->coord_dim = coord_dim_;
5802  this->num_local_coords = num_local_coords_;
5803  this->num_global_coords = num_global_coords_;
5804  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
5805  this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].
5806 
5807  this->num_weights_per_coord = num_weights_per_coord_;
5808  this->mj_uniform_weights = mj_uniform_weights_;
5809  this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
5810  this->mj_uniform_parts = mj_uniform_parts_;
5811  this->mj_part_sizes = mj_part_sizes_;
5812 
5813  this->num_threads = 1;
5814 #ifdef HAVE_ZOLTAN2_OMP
5815 #pragma omp parallel
5816 
5817  {
5818  this->num_threads = omp_get_num_threads();
5819  }
5820 #endif
5821  }
5822  //this->set_input_data();
5823  this->set_part_specifications();
5824 
5825  this->allocate_set_work_memory();
5826 
5827  //We duplicate the comm as we create subcommunicators during migration.
5828  //We keep the problemComm as it is, while comm changes after each migration.
5829  this->comm = this->mj_problemComm->duplicate();
5830 
5831  //initially there is a single partition
5832  mj_part_t current_num_parts = 1;
5833  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
5834 
5835  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
5836 
5837  mj_part_t output_part_begin_index = 0;
5838  mj_part_t future_num_parts = this->total_num_part;
5839  bool is_data_ever_migrated = false;
5840 
5841  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
5842  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
5843  next_future_num_parts_in_parts->push_back(this->num_global_parts);
5844 
5845  RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
5846  RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);
5847 
5848  compute_global_box();
5849  if(this->mj_keep_part_boxes){
5850  this->init_part_boxes(output_part_boxes);
5851  }
5852 
5853  for (int i = 0; i < this->recursion_depth; ++i){
5854  //partitioning array. size will be the number of current partitions, and this
5855  //holds how many parts each part will be divided into in the current dimension partitioning.
5856  std::vector <mj_part_t> num_partitioning_in_current_dim;
5857 
5858  //number of parts that will be obtained at the end of this partitioning.
5859  //future_num_part_in_parts is of the size of the current number of parts;
5860  //it holds how many more parts each part should be divided into in the further
5861  //iterations. this will be used to calculate num_partitioning_in_current_dim,
5862  //as the number of parts that the part will be partitioned
5863  //in the current dimension partitioning.
5864 
5865  //next_future_num_parts_in_parts will be of the size of the output part count,
5866  //and this will hold how many more parts each output part
5867  //should be divided. this array will also be used to determine the weight ratios
5868  //of the parts.
5869  //swap the arrays to use iteratively..
5870  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
5871  future_num_part_in_parts = next_future_num_parts_in_parts;
5872  next_future_num_parts_in_parts = tmpPartVect;
5873 
5874  //clear next_future_num_parts_in_parts array as
5875  //getPartitionArrays expects it to be empty.
5876  //it also expects num_partitioning_in_current_dim to be empty as well.
5877  next_future_num_parts_in_parts->clear();
5878 
5879  if(this->mj_keep_part_boxes){
5880  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5881  input_part_boxes = output_part_boxes;
5882  output_part_boxes = tmpPartBoxes;
5883  output_part_boxes->clear();
5884  }
5885 
5886  //returns the total no. of output parts for this dimension partitioning.
5887  mj_part_t output_part_count_in_dimension =
5888  this->update_part_num_arrays(
5889  num_partitioning_in_current_dim,
5890  future_num_part_in_parts,
5891  next_future_num_parts_in_parts,
5892  future_num_parts,
5893  current_num_parts,
5894  i,
5895  input_part_boxes,
5896  output_part_boxes);
5897 
5898  //if the number of obtained parts equal to current number of parts,
5899  //skip this dimension. For example, this happens when 1 is given in the input
5900  //part array, e.g., P=4,5,1,2
5901  if(output_part_count_in_dimension == current_num_parts) {
5902  //still need to swap the input output arrays.
5903  tmpPartVect= future_num_part_in_parts;
5904  future_num_part_in_parts = next_future_num_parts_in_parts;
5905  next_future_num_parts_in_parts = tmpPartVect;
5906 
5907  if(this->mj_keep_part_boxes){
5908  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5909  input_part_boxes = output_part_boxes;
5910  output_part_boxes = tmpPartBoxes;
5911  }
5912  continue;
5913  }
5914 
5915 
5916  //get the coordinate axis along which the partitioning will be done.
5917  int coordInd = i % this->coord_dim;
5918  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
5919 
5920  //convert i to string to be used for debugging purposes.
5921  std::string istring = Teuchos::toString<int>(i);
5922  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
5923 
5924  //allocate memory for the indices
5925  //of the parts in the permutation array.
5926  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
5927 
5928  //the index where in the new_part_xadj will be written.
5929  mj_part_t output_part_index = 0;
5930  //whatever is written to output_part_index will be added with output_coordinate_end_index
5931  //so that the points will be shifted.
5932  mj_part_t output_coordinate_end_index = 0;
5933 
5934  mj_part_t current_work_part = 0;
5935  mj_part_t current_concurrent_num_parts =
5936  std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
5937 
5938  mj_part_t obtained_part_index = 0;
5939 
5940  //run for all available parts.
5941  for (; current_work_part < current_num_parts;
5942  current_work_part += current_concurrent_num_parts){
5943 
5944  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
5945  this->max_concurrent_part_calculation);
5946 
5947  mj_part_t actual_work_part_count = 0;
5948  //initialization for 1D partitioning.
5949  //get the min and max coordinates of each part
5950  //together with the part weights of each part.
5951  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
5952  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
5953 
5954  //if this part won't be partitioned any further
5955  //don't do any work for this part.
5956  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
5957  continue;
5958  }
5959  ++actual_work_part_count;
5960  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
5961  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
5962 
5963 /*
5964  cout << "i:" << i << " j:" << current_work_part + kk
5965  << " coordinate_begin_index:" << coordinate_begin_index
5966  << " coordinate_end_index:" << coordinate_end_index
5967  << " total:" << coordinate_end_index - coordinate_begin_index<< endl;
5968  */
5969  this->mj_get_local_min_max_coord_totW(
5970  coordinate_begin_index,
5971  coordinate_end_index,
5972  this->coordinate_permutations,
5973  mj_current_dim_coords,
5974  this->process_local_min_max_coord_total_weight[kk], //min_coordinate
5975  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
5976  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight
5977 
5978  }
5979 
5980  //1D partitioning
5981  if (actual_work_part_count > 0){
5982  //obtain global Min max of the part.
5983  this->mj_get_global_min_max_coord_totW(
5984  current_concurrent_num_parts,
5985  this->process_local_min_max_coord_total_weight,
5986  this->global_min_max_coord_total_weight);
5987 
5988  //represents the total number of cutlines
5989  //whose coordinate should be determined.
5990  mj_part_t total_incomplete_cut_count = 0;
5991 
5992  //Compute weight ratios for parts & cuts:
5993  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
5994  //part0 cut0 part1 cut1 part2 cut2 part3
5995  mj_part_t concurrent_part_cut_shift = 0;
5996  mj_part_t concurrent_part_part_shift = 0;
5997  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
5998  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
5999  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6000  current_concurrent_num_parts];
6001 
6002  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6003  2 * current_concurrent_num_parts];
6004 
6005  mj_part_t concurrent_current_part_index = current_work_part + kk;
6006 
6007  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6008 
6009  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6010  mj_scalar_t *current_target_part_weights = this->target_part_weights +
6011  concurrent_part_part_shift;
6012  //shift the usedCutCoordinate array by the number of cuts.
6013  concurrent_part_cut_shift += partition_count - 1;
6014  //shift the target_part_weights array by the number of parts.
6015  concurrent_part_part_shift += partition_count;
6016 
6017 
6018  //calculate only if part is not empty,
6019  //and part will be further partitioned.
6020  if(partition_count > 1 && min_coordinate <= max_coordinate){
6021 
6022  //increase total_incomplete_cut_count by the number of cut lines
6023  //of the current part.
6024  total_incomplete_cut_count += partition_count - 1;
6025  //set the number of cut lines that should be determined
6026  //for this part.
6027  this->my_incomplete_cut_count[kk] = partition_count - 1;
6028 
6029  //get the target weights of the parts.
6030  this->mj_get_initial_cut_coords_target_weights(
6031  min_coordinate,
6032  max_coordinate,
6033  partition_count - 1,
6034  global_total_weight,
6035  usedCutCoordinate,
6036  current_target_part_weights,
6037  future_num_part_in_parts,
6038  next_future_num_parts_in_parts,
6039  concurrent_current_part_index,
6040  obtained_part_index);
6041 
6042  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6043  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
6044 
6045  //get the initial estimated part assignments of the
6046  //coordinates.
6047  this->set_initial_coordinate_parts(
6048  max_coordinate,
6049  min_coordinate,
6050  concurrent_current_part_index,
6051  coordinate_begin_index, coordinate_end_index,
6052  this->coordinate_permutations,
6053  mj_current_dim_coords,
6054  this->assigned_part_ids,
6055  partition_count);
6056  }
6057  else {
6058  // e.g., if have fewer coordinates than parts, don't need to do next dim.
6059  this->my_incomplete_cut_count[kk] = 0;
6060  }
6061  obtained_part_index += partition_count;
6062  }
6063 
6064 
6065 
6066  //the used imbalance is always 0, as it is difficult to
6067  //estimate a range.
6068  mj_scalar_t used_imbalance = 0;
6069 
6070 
6071  // Determine cut lines for all concurrent parts here.
6072  this->mj_1D_part(
6073  mj_current_dim_coords,
6074  used_imbalance,
6075  current_work_part,
6076  current_concurrent_num_parts,
6077  current_cut_coordinates,
6078  total_incomplete_cut_count,
6079  num_partitioning_in_current_dim);
6080  }
6081 
6082  //create new part chunks
6083  {
6084  mj_part_t output_array_shift = 0;
6085  mj_part_t cut_shift = 0;
6086  size_t tlr_shift = 0;
6087  size_t partweight_array_shift = 0;
6088 
6089  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6090  mj_part_t current_concurrent_work_part = current_work_part + kk;
6091  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6092 
6093  //if the part is empty, skip the part.
6094  if((num_parts != 1 )
6095  &&
6096  this->global_min_max_coord_total_weight[kk] >
6097  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6098 
6099  //we still need to write the begin and end point of the
6100  //empty part. simply set it to zero; the array indices will be shifted later.
6101  for(mj_part_t jj = 0; jj < num_parts; ++jj){
6102  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6103  }
6104  cut_shift += num_parts - 1;
6105  tlr_shift += (4 *(num_parts - 1) + 1);
6106  output_array_shift += num_parts;
6107  partweight_array_shift += (2 * (num_parts - 1) + 1);
6108  continue;
6109  }
6110 
6111  mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6112  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6113  current_concurrent_work_part -1];
6114  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6115  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6116  cut_shift;
6117 
6118  //mj_scalar_t *used_tlr_array = this->total_part_weight_left_right_closests + tlr_shift;
6119 
6120  for(int ii = 0; ii < this->num_threads; ++ii){
6121  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6122  }
6123 
6124  if(num_parts > 1){
6125  if(this->mj_keep_part_boxes){
6126  //if part boxes are to be stored update the boundaries.
6127  for (mj_part_t j = 0; j < num_parts - 1; ++j){
6128  (*output_part_boxes)[output_array_shift + output_part_index +
6129  j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6130  /*update max*/, coordInd);
6131 
6132  (*output_part_boxes)[output_array_shift + output_part_index + j +
6133  1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6134  /*update min*/, coordInd);
6135  }
6136  }
6137 
6138  // Rewrite the indices based on the computed cuts.
6139  this->mj_create_new_partitions(
6140  num_parts,
6141  mj_current_dim_coords,
6142  current_concurrent_cut_coordinate,
6143  coordinate_begin,
6144  coordinate_end,
6145  used_local_cut_line_weight_to_left,
6146  this->thread_part_weight_work,
6147  this->new_part_xadj + output_part_index + output_array_shift
6148  );
6149 
6150  }
6151  else {
6152  //if this part is partitioned into 1 then just copy
6153  //the old values.
6154  mj_lno_t part_size = coordinate_end - coordinate_begin;
6155  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6156  memcpy(
6157  this->new_coordinate_permutations + coordinate_begin,
6158  this->coordinate_permutations + coordinate_begin,
6159  part_size * sizeof(mj_lno_t));
6160  }
6161  cut_shift += num_parts - 1;
6162  tlr_shift += (4 *(num_parts - 1) + 1);
6163  output_array_shift += num_parts;
6164  partweight_array_shift += (2 * (num_parts - 1) + 1);
6165  }
6166 
6167  //shift cut coordinates so that all cut coordinates are stored.
6168  //no shift now because we dont keep the cuts.
6169  //current_cut_coordinates += cut_shift;
6170 
6171  //mj_create_new_partitions partitioned the coordinates and
6172  //wrote the indices as if there were a single part.
6173  //now we need to shift the beginning indices.
6174  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6175  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6176  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6177  //shift it by previousCount
6178  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6179  }
6180  //increase the previous count by current end.
6181  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6182  //increase the current out.
6183  output_part_index += num_parts ;
6184  }
6185  }
6186  }
6187  // end of this partitioning dimension
6188 
6189 
6190  int current_world_size = this->comm->getSize();
6191  long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6192 
6193 
6194  bool is_migrated_in_current_dimension = false;
6195 
6196  //we migrate if there is more partitioning to be done after this step,
6197  //if the migration is not forced to be avoided,
6198  //and if the operation is not sequential.
6199  if (future_num_parts > 1 &&
6200  this->check_migrate_avoid_migration_option >= 0 &&
6201  current_world_size > 1){
6202 
6203  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6204  mj_part_t num_parts = output_part_count_in_dimension;
6205  if ( this->mj_perform_migration(
6206  num_parts,
6207  current_num_parts, //output
6208  next_future_num_parts_in_parts, //output
6209  output_part_begin_index,
6210  migration_reduce_all_population,
6211  this->num_local_coords / (future_num_parts * current_num_parts),
6212  istring,
6213  input_part_boxes, output_part_boxes) ) {
6214  is_migrated_in_current_dimension = true;
6215  is_data_ever_migrated = true;
6216  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
6217  istring);
6218  //since data is migrated, we reduce the number of reduceAll operations for the last part.
6219  this->total_dim_num_reduce_all /= num_parts;
6220  }
6221  else {
6222  is_migrated_in_current_dimension = false;
6223  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6224  }
6225  }
6226 
6227  //swap the coordinate permutations for the next dimension.
6228  mj_lno_t * tmp = this->coordinate_permutations;
6229  this->coordinate_permutations = this->new_coordinate_permutations;
6230  this->new_coordinate_permutations = tmp;
6231 
6232  if(!is_migrated_in_current_dimension){
6233  this->total_dim_num_reduce_all -= current_num_parts;
6234  current_num_parts = output_part_count_in_dimension;
6235  }
6236  freeArray<mj_lno_t>(this->part_xadj);
6237  this->part_xadj = this->new_part_xadj;
6238  this->new_part_xadj = NULL;
6239  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6240  }
6241 
6242  // Partitioning is done
6243  delete future_num_part_in_parts;
6244  delete next_future_num_parts_in_parts;
6245 
6246  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6248 
6249 
6250  //get the final parts of each initial coordinate
6251  //the results will be written to
6252  //this->assigned_part_ids for gnos given in this->current_mj_gnos
6253  this->set_final_parts(
6254  current_num_parts,
6255  output_part_begin_index,
6256  output_part_boxes,
6257  is_data_ever_migrated);
6258 
6259  result_assigned_part_ids_ = this->assigned_part_ids;
6260  result_mj_gnos_ = this->current_mj_gnos;
6261 
6262  this->free_work_memory();
6263  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
6264  this->mj_env->debug(3, "Out of MultiJagged");
6265 
6266 }
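// Illustrative sketch (not compiled into the build): the driver above performs
// one 1D partitioning per iteration for recursion_depth iterations, cycling
// through the axes with coordInd = i % coord_dim, so the part count compounds
// multiplicatively. With a hypothetical part_no_array = {4, 5, 2}:
#if 0
int part_no_array[] = {4, 5, 2};
int current_num_parts = 1;
for (int i = 0; i < 3; ++i) {
 //the axis partitioned in iteration i would be i % coord_dim
 current_num_parts *= part_no_array[i];
}
//current_num_parts is now 4 * 5 * 2 = 40 == num_global_parts.
#endif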
6267 
6268 
6272 template <typename Adapter>
6273 class Zoltan2_AlgMJ : public Algorithm<Adapter>
6274 {
6275 private:
6276 
6277 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6278 
6279  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
6280  typedef typename Adapter::scalar_t mj_scalar_t;
6281  typedef typename Adapter::gno_t mj_gno_t;
6282  typedef typename Adapter::lno_t mj_lno_t;
6283  typedef typename Adapter::node_t mj_node_t;
6284  typedef typename Adapter::part_t mj_part_t;
6285  typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t;
6286  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6287 #endif
6288  AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t> mj_partitioner;
6289 
6290  RCP<const Environment> mj_env; //the environment object
6291  RCP<const Comm<int> > mj_problemComm; //initial comm object
6292  RCP<const coordinateModel_t> mj_coords; //coordinate adapter
6293 
6294  //PARAMETERS
6295  double imbalance_tolerance; //input imbalance tolerance.
6296  size_t num_global_parts; //the targeted number of parts
6297  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
6298  int recursion_depth; //the number of steps that partitioning will be solved in.
6299 
6300  int coord_dim; // coordinate dimension.
6301  mj_lno_t num_local_coords; //number of local coords.
6302  mj_gno_t num_global_coords; //number of global coords.
6303  const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
6304  mj_scalar_t **mj_coordinates; //two dimension coordinate array
6305 
6306  int num_weights_per_coord; // number of weights per coordinate
6307  bool *mj_uniform_weights; //if the coordinates have uniform weights.
6308  mj_scalar_t **mj_weights; //two dimensional weight array
6309  bool *mj_uniform_parts; //if the target parts are uniform
6310  mj_scalar_t **mj_part_sizes; //target part weight sizes.
6311 
6312  bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordiante to different parts.
6313  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
6314  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
6315  mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
6316  bool mj_keep_part_boxes; //if the boxes need to be kept.
6317 
6318  int num_threads;
6319 
6320  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
6321 
6322  ArrayRCP<mj_part_t> comXAdj_; //communication graph xadj
6323  ArrayRCP<mj_part_t> comAdj_; //communication graph adj.
6324 
6325 
6326  //when we have strided data, an unstrided copy is returned in RCP form.
6327  //we need to hold on to that data during the execution of mj, so that the data is not released.
6328  //coordinate_ArrayRCP_holder will hold that data, and release it when MJ is deleted.
6329  ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6330 
6331  void set_up_partitioning_data(
6332  const RCP<PartitioningSolution<Adapter> >&solution);
6333 
6334  void set_input_parameters(const Teuchos::ParameterList &p);
6335 
6336  void free_work_memory();
6337 
6338  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
6339 
6340 public:
6341 
6342  Zoltan2_AlgMJ(const RCP<const Environment> &env,
6343  RCP<const Comm<int> > &problemComm,
6344  const RCP<const coordinateModel_t> &coords) :
6345  mj_partitioner(), mj_env(env),
6346  mj_problemComm(problemComm),
6347  mj_coords(coords),
6348  imbalance_tolerance(0),
6349  num_global_parts(1), part_no_array(NULL),
6350  recursion_depth(0),
6351  coord_dim(0),num_local_coords(0), num_global_coords(0),
6352  initial_mj_gnos(NULL), mj_coordinates(NULL),
6353  num_weights_per_coord(0),
6354  mj_uniform_weights(NULL), mj_weights(NULL),
6355  mj_uniform_parts(NULL),
6356  mj_part_sizes(NULL),
6357  distribute_points_on_cut_lines(true),
6358  max_concurrent_part_calculation(1),
6359  check_migrate_avoid_migration_option(0),
6360  minimum_migration_imbalance(0.30),
6361  mj_keep_part_boxes(false), num_threads(1), mj_run_as_rcb(false),
6362  comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6363  {}
6364  ~Zoltan2_AlgMJ(){
6365  if (coordinate_ArrayRCP_holder != NULL){
6366  delete [] this->coordinate_ArrayRCP_holder;
6367  this->coordinate_ArrayRCP_holder = NULL;
6368  }
6369  }
6370 
6371  /*! \brief Set up validators specific to this algorithm
6372  */
6373  static void getValidParameters(ParameterList & pl)
6374  {
6375  const bool bUnsorted = true; // this clarifies that the flag is for unsorted input
6376  RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6377  Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
6378  pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
6379  "algorithm. As many as the dimension count.", mj_parts_Validator);
6380 
6381  pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
6382  "coordinates will be calculated concurently.", Environment::getAnyIntValidator());
6383 
6384  pl.set("mj_minimum_migration_imbalance", 1.1,
6385  "mj_minimum_migration_imbalance, the minimum imbalance of the "
6386  "processors to avoid migration",
6388 
6389  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6390  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6391  pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
6392  "depending on the imbalance, 1 for forcing migration, 2 for "
6393  "avoiding migration", mj_migration_option_validator);
6394 
6395  // bool parameter
6396  pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
6397  "geometric partitioning.", Environment::getBoolValidator());
6398 
6399  // bool parameter
6400  pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
6402 
6403  pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
6404  "greater than 0.", Environment::getAnyIntValidator());
6405  }
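For context, a minimal sketch of how a caller might populate the entries registered above before running the algorithm. The parameter names are the ones set in getValidParameters(); the surrounding problem setup (adapters, problem construction) is omitted, and the helper function itself is hypothetical:

// Sketch (not part of this file): configuring MJ via a Teuchos::ParameterList.
#include <Teuchos_ParameterList.hpp>

Teuchos::ParameterList makeMjParams()
{
  Teuchos::ParameterList params("mj example");
  params.set("algorithm", "multijagged");      // select the MJ algorithm
  params.set("mj_parts", "2,2,2");             // parts per dimension (3D here)
  params.set("mj_concurrent_part_count", 2);   // concurrent cut computations
  params.set("mj_keep_part_boxes", true);      // required for point/boxAssign
  params.set("mj_migration_option", 0);        // 0: let MJ decide on migration
  params.set("mj_minimum_migration_imbalance", 1.35);
  return params;
}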
6406 
6413  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
6414 
6415  mj_partBoxVector_t &getPartBoxesView() const
6416  {
6417  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6418  return *pBoxes;
6419  }
6420 
6421  mj_part_t pointAssign(int dim, mj_scalar_t *point) const;
6422 
6423  void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper,
6424  size_t &nPartsFound, mj_part_t **partsFound) const;
6425 
6426 
6427  /*! \brief Returns the communication graph resulting from the MJ partitioning.
6428  */
6429  void getCommunicationGraph(
6430  const PartitioningSolution<Adapter> *solution,
6431  ArrayRCP<mj_part_t> &comXAdj,
6432  ArrayRCP<mj_part_t> &comAdj);
6433 };
6434 
6435 
6445 template <typename Adapter>
6446 void Zoltan2_AlgMJ<Adapter>::partition(
6447  const RCP<PartitioningSolution<Adapter> > &solution
6448 )
6449 {
6450  this->set_up_partitioning_data(solution);
6451  this->set_input_parameters(this->mj_env->getParameters());
6452  if (this->mj_keep_part_boxes){
6453  this->mj_partitioner.set_to_keep_part_boxes();
6454  }
6455  this->mj_partitioner.set_partitioning_parameters(
6456  this->distribute_points_on_cut_lines,
6457  this->max_concurrent_part_calculation,
6458  this->check_migrate_avoid_migration_option,
6459  this->minimum_migration_imbalance);
6460 
6461  mj_part_t *result_assigned_part_ids = NULL;
6462  mj_gno_t *result_mj_gnos = NULL;
6463  this->mj_partitioner.multi_jagged_part(
6464  this->mj_env,
6465  this->mj_problemComm,
6466 
6467  this->imbalance_tolerance,
6468  this->num_global_parts,
6469  this->part_no_array,
6470  this->recursion_depth,
6471 
6472  this->coord_dim,
6473  this->num_local_coords,
6474  this->num_global_coords,
6475  this->initial_mj_gnos,
6476  this->mj_coordinates,
6477 
6478  this->num_weights_per_coord,
6479  this->mj_uniform_weights,
6480  this->mj_weights,
6481  this->mj_uniform_parts,
6482  this->mj_part_sizes,
6483 
6484  result_assigned_part_ids,
6485  result_mj_gnos
6486  );
6487 
6488  // Reorder results so that they match the order of the input
6489 #if defined(__cplusplus) && __cplusplus >= 201103L
6490  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6491  localGidToLid.reserve(this->num_local_coords);
6492  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6493  localGidToLid[this->initial_mj_gnos[i]] = i;
6494 
6495  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[this->num_local_coords],
6496  0, this->num_local_coords, true);
6497 
6498  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6499  mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6500  partId[origLID] = result_assigned_part_ids[i];
6501  }
6502 
6503 #else
6504  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6505  localGidToLid(this->num_local_coords);
6506  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6507  localGidToLid.put(this->initial_mj_gnos[i], i);
6508 
6509  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[this->num_local_coords],
6510  0, this->num_local_coords, true);
6511 
6512  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6513  mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6514  partId[origLID] = result_assigned_part_ids[i];
6515  }
6516 
6517 #endif // C++11 is enabled
6518 
6519  delete [] result_mj_gnos;
6520  delete [] result_assigned_part_ids;
6521 
6522  solution->setParts(partId);
6523  this->free_work_memory();
6524 }
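The reordering step above is a common idiom: the partitioner returns results keyed by global ID in an order that may differ from the input, so a GID-to-LID hash restores input order before setParts() is called. A self-contained illustration with plain standard-library types (the function and type choices are illustrative, not part of this file):

#include <cstdint>
#include <unordered_map>
#include <vector>

// Scatter results (keyed by global ID, arbitrary order) back into the
// order of the original input GIDs, mirroring the loop in partition().
std::vector<int> reorderToInput(const std::vector<std::int64_t> &inputGids,
                                const std::vector<std::int64_t> &resultGids,
                                const std::vector<int> &resultParts)
{
  std::unordered_map<std::int64_t, std::size_t> gidToLid;
  gidToLid.reserve(inputGids.size());
  for (std::size_t i = 0; i < inputGids.size(); ++i)
    gidToLid[inputGids[i]] = i;                 // gid -> original local index

  std::vector<int> partOfInput(inputGids.size());
  for (std::size_t i = 0; i < resultGids.size(); ++i)
    partOfInput[gidToLid[resultGids[i]]] = resultParts[i];
  return partOfInput;
}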
6525 
6526 /* \brief Frees the memory allocated for partitioning data.
6527  * */
6528 template <typename Adapter>
6529 void Zoltan2_AlgMJ<Adapter>::free_work_memory(){
6530  freeArray<mj_scalar_t *>(this->mj_coordinates);
6531  freeArray<mj_scalar_t *>(this->mj_weights);
6532  freeArray<bool>(this->mj_uniform_parts);
6533  freeArray<mj_scalar_t *>(this->mj_part_sizes);
6534  freeArray<bool>(this->mj_uniform_weights);
6535 
6536 }
6537 
6538 /* \brief Sets the partitioning data for the multijagged algorithm.
6539  * */
6540 template <typename Adapter>
6541 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
6542  const RCP<PartitioningSolution<Adapter> > &solution
6543 )
6544 {
6545  this->coord_dim = this->mj_coords->getCoordinateDim();
6546  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
6547  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
6548  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
6549  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
6550 
6551  // From the Solution we get part information.
6552  // If the part sizes for a given criteria are not uniform,
6553  // then they are values that sum to 1.0.
6554  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
6555  //allocate only the two-dimensional pointers.
6556  //raw pointer addresses will be obtained from the multivector.
6557  this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
6558  this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
6559 
6560  //if the partitioning results are to be uniform.
6561  this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
6562  //if in a criteria dimension, uniform part is false this shows ratios of
6563  //the target part weights.
6564  this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
6565  //if the weights of coordinates are uniform in a criteria dimension.
6566  this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
6567 
6568  typedef StridedData<mj_lno_t, mj_scalar_t> input_t;
6569  ArrayView<const mj_gno_t> gnos;
6570  ArrayView<input_t> xyz;
6571  ArrayView<input_t> wgts;
6572 
6573 
6574  this->coordinate_ArrayRCP_holder = new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
6575 
6576  this->mj_coords->getCoordinates(gnos, xyz, wgts);
6577  //obtain global ids.
6578  ArrayView<const mj_gno_t> mj_gnos = gnos;
6579  this->initial_mj_gnos = mj_gnos.getRawPtr();
6580 
6581  //extract coordinates from multivector.
6582  for (int dim=0; dim < this->coord_dim; dim++){
6583  ArrayRCP<const mj_scalar_t> ar;
6584  xyz[dim].getInputArray(ar);
6585  this->coordinate_ArrayRCP_holder[dim] = ar;
6586 
6587  //multiJagged coordinate values assignment
6588  this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
6589  }
6590 
6591  //if no weights are provided set uniform weight.
6592  if (this->num_weights_per_coord == 0){
6593  this->mj_uniform_weights[0] = true;
6594  this->mj_weights[0] = NULL;
6595  }
6596  else{
6597  //if weights are provided get weights for all weight indices
6598  for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
6599  ArrayRCP<const mj_scalar_t> ar;
6600  wgts[wdim].getInputArray(ar);
6601  this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
6602  this->mj_uniform_weights[wdim] = false;
6603  this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
6604  }
6605  }
6606 
6607  for (int wdim = 0; wdim < criteria_dim; wdim++){
6608  if (solution->criteriaHasUniformPartSizes(wdim)){
6609  this->mj_uniform_parts[wdim] = true;
6610  this->mj_part_sizes[wdim] = NULL;
6611  }
6612  else{
6613  std::cerr << "MJ does not support non-uniform target part weights" << std::endl;
6614  exit(1);
6615  }
6616  }
6617 }
6618 
6619 /* \brief Sets the partitioning parameters for the multijagged algorithm.
6620  * \param pl the parameter list provided to the Zoltan2 call
6621  * */
6622 template <typename Adapter>
6623 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){
6624 
6625  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
6626  if (pe){
6627  double tol;
6628  tol = pe->getValue(&tol);
6629  this->imbalance_tolerance = tol - 1.0;
6630  }
6631 
6632  // TODO: Maybe a more relaxed tolerance is needed. RCB uses 10%.
6633  if (this->imbalance_tolerance <= 0)
6634  this->imbalance_tolerance= 10e-4;
6635 
6636  //if an input partitioning array is provided.
6637  this->part_no_array = NULL;
6638  //the length of the input partitioning array.
6639  this->recursion_depth = 0;
6640 
6641  if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
6642  this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
6643  this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
6644  this->mj_env->debug(2, "mj_parts provided by user");
6645  }
6646 
6647  //get mj specific parameters.
6648  this->distribute_points_on_cut_lines = true;
6649  this->max_concurrent_part_calculation = 1;
6650 
6651  this->mj_run_as_rcb = false;
6652  int mj_user_recursion_depth = -1;
6653  this->mj_keep_part_boxes = false;
6654  this->check_migrate_avoid_migration_option = 0;
6655  this->minimum_migration_imbalance = 0.35;
6656 
6657  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
6658  if (pe){
6659  double imb;
6660  imb = pe->getValue(&imb);
6661  this->minimum_migration_imbalance = imb - 1.0;
6662  }
6663 
6664  pe = pl.getEntryPtr("mj_migration_option");
6665  if (pe){
6666  this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
6667  }else {
6668  this->check_migrate_avoid_migration_option = 0;
6669  }
6670  if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
6671 
6672 
6673  pe = pl.getEntryPtr("mj_concurrent_part_count");
6674  if (pe){
6675  this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
6676  }else {
6677  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
6678  }
6679 
6680  pe = pl.getEntryPtr("mj_keep_part_boxes");
6681  if (pe){
6682  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
6683  }else {
6684  this->mj_keep_part_boxes = false; // Set to default if not provided.
6685  }
6686 
6687 
6688  // For now, need keep_part_boxes to do pointAssign and boxAssign.
6689  // pe = pl.getEntryPtr("keep_cuts");
6690  // if (pe){
6691  // int tmp = pe->getValue(&tmp);
6692  // if (tmp) this->mj_keep_part_boxes = true;
6693  // }
6694 
6695  //need to keep part boxes if mapping type is geometric.
6696  if (this->mj_keep_part_boxes == false){
6697  pe = pl.getEntryPtr("mapping_type");
6698  if (pe){
6699  int mapping_type = -1;
6700  mapping_type = pe->getValue(&mapping_type);
6701  if (mapping_type == 0){
6702  mj_keep_part_boxes = true;
6703  }
6704  }
6705  }
6706 
6707  //check whether MJ is to be run in RCB mode.
6708  pe = pl.getEntryPtr("mj_enable_rcb");
6709  if (pe){
6710  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
6711  }else {
6712  this->mj_run_as_rcb = false; // Set to default if not provided.
6713  }
6714 
6715  pe = pl.getEntryPtr("mj_recursion_depth");
6716  if (pe){
6717  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
6718  }else {
6719  mj_user_recursion_depth = -1; // Set to invalid value
6720  }
6721 
6722  bool val = false;
6723  pe = pl.getEntryPtr("rectilinear");
6724  if (pe) val = pe->getValue(&val);
6725  if (val){
6726  this->distribute_points_on_cut_lines = false;
6727  } else {
6728  this->distribute_points_on_cut_lines = true;
6729  }
6730 
6731  if (this->mj_run_as_rcb){
6732  mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
6733  }
6734  if (this->recursion_depth < 1){
6735  if (mj_user_recursion_depth > 0){
6736  this->recursion_depth = mj_user_recursion_depth;
6737  }
6738  else {
6739  this->recursion_depth = this->coord_dim;
6740  }
6741  }
6742 
6743  this->num_threads = 1;
6744 #ifdef HAVE_ZOLTAN2_OMP
6745 #pragma omp parallel
6746  {
6747  this->num_threads = omp_get_num_threads();
6748  }
6749 #endif
6750 
6751 }
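When mj_enable_rcb is set, the recursion depth computed above becomes ceil(log2(num_global_parts)), since each recursion step bisects every current part. A small illustration of that arithmetic, as a hypothetical helper mirroring the expression in the code:

#include <cmath>
#include <cstddef>

// RCB mode: the number of bisection steps needed to reach p parts.
int rcbRecursionDepth(std::size_t numGlobalParts)
{
  return static_cast<int>(
      std::ceil(std::log(static_cast<double>(numGlobalParts)) / std::log(2.0)));
}
// e.g. rcbRecursionDepth(8) == 3, rcbRecursionDepth(10) == 4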
6752 
6754 template <typename Adapter>
6755 void Zoltan2_AlgMJ<Adapter>::boxAssign(
6756  int dim,
6757  typename Adapter::scalar_t *lower,
6758  typename Adapter::scalar_t *upper,
6759  size_t &nPartsFound,
6760  typename Adapter::part_t **partsFound) const
6761 {
6762  // TODO: Implement with cuts rather than boxes to reduce algorithmic
6763  // TODO: complexity. Or at least do a search through the boxes, using
6764  // TODO: p x q x r x ... if possible.
6765 
6766  nPartsFound = 0;
6767  *partsFound = NULL;
6768 
6769  if (this->mj_keep_part_boxes) {
6770 
6771  // Get vector of part boxes
6772  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6773 
6774  size_t nBoxes = (*partBoxes).size();
6775  if (nBoxes == 0) {
6776  throw std::logic_error("no part boxes exist");
6777  }
6778 
6779  // Determine whether the box overlaps the globalBox at all
6780  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6781 
6782  if (globalBox->boxesOverlap(dim, lower, upper)) {
6783 
6784  std::vector<typename Adapter::part_t> partlist;
6785 
6786  // box overlaps the global box; find specific overlapping boxes
6787  for (size_t i = 0; i < nBoxes; i++) {
6788  try {
6789  if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
6790  nPartsFound++;
6791  partlist.push_back((*partBoxes)[i].getpId());
6792 
6793 // std::cout << "Given box (";
6794 // for (int j = 0; j < dim; j++)
6795 // std::cout << lower[j] << " ";
6796 // std::cout << ") x (";
6797 // for (int j = 0; j < dim; j++)
6798 // std::cout << upper[j] << " ";
6799 // std::cout << ") overlaps PartBox "
6800 // << (*partBoxes)[i].getpId() << " (";
6801 // for (int j = 0; j < dim; j++)
6802 // std::cout << (*partBoxes)[i].getlmins()[j] << " ";
6803 // std::cout << ") x (";
6804 // for (int j = 0; j < dim; j++)
6805 // std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
6806 // std::cout << ")" << std::endl;
6807  }
6808  }
6809  Z2_FORWARD_EXCEPTIONS;
6810  }
6811  if (nPartsFound) {
6812  *partsFound = new mj_part_t[nPartsFound];
6813  for (size_t i = 0; i < nPartsFound; i++)
6814  (*partsFound)[i] = partlist[i];
6815  }
6816  }
6817  else {
6818  // Box does not overlap the domain at all. Find the closest part
6819  // Not sure how to perform this operation for MJ without having the
6820  // cuts. With the RCB cuts, the concept of a part extending to
6821  // infinity was natural. With the boxes, it is much more difficult.
6822  // TODO: For now, return information indicating NO OVERLAP.
6823 
6824  }
6825  }
6826  else {
6827  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
6828  }
6829 }
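A usage sketch for boxAssign, assuming scalar_t is double, a 3D problem, and mj_keep_part_boxes enabled; "alg" and "part_t" are illustrative stand-ins for a concrete Zoltan2_AlgMJ instance and its part type. Note that boxAssign allocates the result array with new[], so the caller owns and must delete it:

#include <cstddef>

// Sketch: find all parts whose boxes overlap an axis-aligned query box.
template <typename AlgT, typename part_t>
void queryOverlappingParts(const AlgT &alg)
{
  double lo[3] = {0.0, 0.0, 0.0};
  double hi[3] = {0.5, 0.5, 0.5};
  size_t nFound = 0;
  part_t *found = NULL;            // boxAssign allocates this with new[]
  alg.boxAssign(3, lo, hi, nFound, &found);
  for (size_t i = 0; i < nFound; i++) {
    // found[i] is the id of a part whose box overlaps [lo, hi]
  }
  delete [] found;                 // the caller owns the returned array
}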
6830 
6832 template <typename Adapter>
6833 typename Adapter::part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
6834  int dim,
6835  typename Adapter::scalar_t *point) const
6836 {
6837 
6838  // TODO: Implement with cuts rather than boxes to reduce algorithmic
6839  // TODO: complexity. Or at least do a search through the boxes, using
6840  // TODO: p x q x r x ... if possible.
6841 
6842  if (this->mj_keep_part_boxes) {
6843  typename Adapter::part_t foundPart = -1;
6844 
6845  // Get vector of part boxes
6846  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6847 
6848  size_t nBoxes = (*partBoxes).size();
6849  if (nBoxes == 0) {
6850  throw std::logic_error("no part boxes exist");
6851  }
6852 
6853  // Determine whether the point is within the global domain
6854  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6855 
6856  if (globalBox->pointInBox(dim, point)) {
6857 
6858  // point is in the global domain; determine in which part it is.
6859  size_t i;
6860  for (i = 0; i < nBoxes; i++) {
6861  try {
6862  if ((*partBoxes)[i].pointInBox(dim, point)) {
6863  foundPart = (*partBoxes)[i].getpId();
6864 // std::cout << "Point (";
6865 // for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
6866 // std::cout << ") found in box " << i << " part " << foundPart
6867 // << std::endl;
6868 // (*partBoxes)[i].print();
6869  break;
6870  }
6871  }
6872  Z2_FORWARD_EXCEPTIONS;
6873  }
6874 
6875  if (i == nBoxes) {
6876  // This error should never occur
6877  std::ostringstream oss;
6878  oss << "Point (";
6879  for (int j = 0; j < dim; j++) oss << point[j] << " ";
6880  oss << ") not found in domain";
6881  throw std::logic_error(oss.str());
6882  }
6883  }
6884 
6885  else {
6886  // Point is outside the global domain.
6887  // Determine to which part it is closest.
6888  // TODO: with cuts, would not need this special case
6889 
6890  size_t closestBox = 0;
6891  mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
6892  mj_scalar_t *centroid = new mj_scalar_t[dim];
6893  for (size_t i = 0; i < nBoxes; i++) {
6894  (*partBoxes)[i].computeCentroid(centroid);
6895  mj_scalar_t sum = 0.;
6896  mj_scalar_t diff;
6897  for (int j = 0; j < dim; j++) {
6898  diff = centroid[j] - point[j];
6899  sum += diff * diff;
6900  }
6901  if (sum < minDistance) {
6902  minDistance = sum;
6903  closestBox = i;
6904  }
6905  }
6906  foundPart = (*partBoxes)[closestBox].getpId();
6907  delete [] centroid;
6908  }
6909 
6910  return foundPart;
6911  }
6912  else {
6913  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
6914  }
6915 }
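pointAssign follows the same pattern; the fallback above means a point outside the global box is still assigned, namely to the part whose box centroid is nearest. An illustrative call, using the same hypothetical "alg" and "part_t" as in the boxAssign sketch:

// Sketch: locate the part owning a 3D point (requires kept part boxes).
template <typename AlgT, typename part_t>
part_t queryOwningPart(const AlgT &alg)
{
  double pt[3] = {0.25, 0.75, 0.1};
  return alg.pointAssign(3, pt);   // nearest-centroid part if pt is outside
}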
6916 
6917 template <typename Adapter>
6918 void Zoltan2_AlgMJ<Adapter>::getCommunicationGraph(
6919  const PartitioningSolution<Adapter> *solution,
6920  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
6921  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
6922 {
6923  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
6924  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6925  mj_part_t ntasks = (*pBoxes).size();
6926  int dim = (*pBoxes)[0].getDim();
6927  GridHash<mj_scalar_t, mj_part_t> grid(pBoxes, ntasks, dim);
6928  grid.getAdjArrays(comXAdj_, comAdj_);
6929  }
6930  comAdj = comAdj_;
6931  comXAdj = comXAdj_;
6932 }
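comXAdj_/comAdj_ encode the part-adjacency graph in compressed sparse row form. Assuming the conventional CSR layout, where comXAdj has nParts+1 entries and the neighbors of part p occupy comAdj[comXAdj[p] .. comXAdj[p+1]) (verify the exact convention against the GridHash implementation), a caller could walk the graph like this:

// Sketch: traversing the communication graph returned above.
template <typename part_t>
void walkPartAdjacency(part_t nParts, const part_t *comXAdj, const part_t *comAdj)
{
  for (part_t p = 0; p < nParts; ++p) {
    for (part_t k = comXAdj[p]; k < comXAdj[p + 1]; ++k) {
      part_t nbor = comAdj[k];
      (void)nbor; // parts p and nbor have geometrically adjacent boxes
    }
  }
}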
6933 
6934 
6935 template <typename Adapter>
6936 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
6937 Zoltan2_AlgMJ<Adapter>::getGlobalBoxBoundaries() const
6938 {
6939  return this->mj_partitioner.get_kept_boxes();
6940 }
6941 
6942 
6943 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6944  typename mj_part_t>
6945 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6946 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_kept_boxes() const
6947 {
6948  if (this->mj_keep_part_boxes)
6949  return this->kept_boxes;
6950  else
6951  throw std::logic_error("Error: part boxes are not stored.");
6952 }
6953 
6954 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6955  typename mj_part_t>
6956 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6957 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box_boundaries(
6958  RCP<mj_partBoxVector_t> &localPartBoxes
6959 ) const
6960 {
6961  mj_part_t ntasks = this->num_global_parts;
6962  int dim = (*localPartBoxes)[0].getDim();
6963  mj_scalar_t *localPartBoundaries = new mj_scalar_t[ntasks * 2 *dim];
6964 
6965  memset(localPartBoundaries, 0, sizeof(mj_scalar_t) * ntasks * 2 *dim);
6966 
6967  mj_scalar_t *globalPartBoundaries = new mj_scalar_t[ntasks * 2 *dim];
6968  memset(globalPartBoundaries, 0, sizeof(mj_scalar_t) * ntasks * 2 *dim);
6969 
6970  mj_scalar_t *localPartMins = localPartBoundaries;
6971  mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
6972 
6973  mj_scalar_t *globalPartMins = globalPartBoundaries;
6974  mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
6975 
6976  mj_part_t boxCount = localPartBoxes->size();
6977  for (mj_part_t i = 0; i < boxCount; ++i){
6978  mj_part_t pId = (*localPartBoxes)[i].getpId();
6979  //cout << "me:" << comm->getRank() << " has:" << pId << endl;
6980 
6981  mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
6982  mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
6983 
6984  for (int j = 0; j < dim; ++j){
6985  localPartMins[dim * pId + j] = lmins[j];
6986  localPartMaxs[dim * pId + j] = lmaxs[j];
6987  /*
6988  cout << "me:" << comm->getRank() <<
6989  " dim * pId + j:"<< dim * pId + j <<
6990  " localMin:" << localPartMins[dim * pId + j] <<
6991  " localMax:" << localPartMaxs[dim * pId + j] << endl;
6992  */
6993  }
6994  }
6995 
6996  Teuchos::Zoltan2_BoxBoundaries<int, mj_scalar_t> reductionOp(ntasks * 2 *dim);
6997 
6998  reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
6999  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7000  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
7001  for (mj_part_t i = 0; i < ntasks; ++i){
7002  mj_partBox_t tpb(i, dim,
7003  globalPartMins + dim * i,
7004  globalPartMaxs + dim * i);
7005 
7006  /*
7007  for (int j = 0; j < dim; ++j){
7008  cout << "me:" << comm->getRank() <<
7009  " dim * pId + j:"<< dim * i + j <<
7010  " globalMin:" << globalPartMins[dim * i + j] <<
7011  " globalMax:" << globalPartMaxs[dim * i + j] << endl;
7012  }
7013  */
7014  pB->push_back(tpb);
7015  }
7016  delete []localPartBoundaries;
7017  delete []globalPartBoundaries;
7018  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
7019  return pB;
7020 }
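The reduceAll above works because every part box is contributed by exactly one rank: each local buffer is zero except in the slots that rank owns, so the reduction can simply keep any entry whose magnitude is non-negligible. A sketch of that element-wise merge, as an illustration of the idea rather than the actual Zoltan2_BoxBoundaries implementation (the epsilon threshold is an assumption):

#include <cmath>

// Sketch: element-wise merge of box-boundary buffers. Zero-initialized
// slots mean "not owned by this rank"; any clearly nonzero entry wins.
// (A true boundary of exactly 0 would be skipped here; the real
// reduction operator's epsilon handling governs that case.)
template <typename scalar_t>
void mergeBoxBoundaries(int count, const scalar_t in[], scalar_t inout[])
{
  const scalar_t eps = static_cast<scalar_t>(1e-14);  // assumed threshold
  for (int i = 0; i < count; ++i)
    if (std::abs(in[i]) > eps)
      inout[i] = in[i];
}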
7021 } // namespace Zoltan2
7022 
7023 #endif