49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_ 50 #define _ZOLTAN2_ALGMultiJagged_HPP_ 57 #include <Teuchos_StandardParameterEntryValidators.hpp> 59 #include <Tpetra_Distributor.hpp> 60 #include <Teuchos_ParameterList.hpp> 67 #if defined(__cplusplus) && __cplusplus >= 201103L 68 #include <unordered_map> 70 #include <Teuchos_Hashtable.hpp> 71 #endif // C++11 is enabled 73 #ifdef ZOLTAN2_USEZOLTANCOMM 74 #ifdef HAVE_ZOLTAN2_MPI 75 #define ENABLE_ZOLTAN_MIGRATION 76 #include "zoltan_comm_cpp.h" 77 #include "zoltan_types.h" 81 #ifdef HAVE_ZOLTAN2_OMP 85 #define LEAST_SIGNIFICANCE 0.0001 86 #define SIGNIFICANCE_MUL 1000 91 #define FUTURE_REDUCEALL_CUTOFF 1500000 94 #define MIN_WORK_LAST_DIM 1000 99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x)) 101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \ 102 (Wachieved) / ((totalW) * (expectedRatio)) - 1 103 #define imbalanceOf2(Wachieved, wExpected) \ 104 (Wachieved) / (wExpected) - 1 107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp; 116 template <
typename Ordinal,
typename T>
135 size(s_), _EPSILON (
std::numeric_limits<T>::
epsilon()){}
139 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const 141 for (Ordinal i=0; i < count; i++){
142 if (
Z2_ABS(inBuffer[i]) > _EPSILON){
143 inoutBuffer[i] = inBuffer[i];
155 template <
typename T>
160 throw "cannot allocate memory";
172 template <
typename T>
188 template <
typename IT,
typename CT,
typename WT>
207 this->index = index_;
208 this->count = count_;
214 this->index = other.
index;
215 this->count = other.
count;
216 this->val = other.
val;
224 void set(IT index_ ,CT count_, WT *vals_){
225 this->index = index_;
226 this->count = count_;
232 this->index = other.
index;
233 this->count = other.
count;
234 this->val = other.
val;
238 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const{
239 assert (this->count == other.count);
240 for(CT i = 0; i < this->
count; ++i){
242 if (
ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
246 if(this->val[i] < other.val[i]){
255 return this->index < other.index;
258 assert (this->count == other.
count);
259 for(CT i = 0; i < this->
count; ++i){
265 if(this->val[i] > other.
val[i]){
275 return this->index > other.
index;
282 template <
class IT,
class WT>
293 template <
class IT,
class WT>
299 IT i, ir=n, j, k, l=1;
300 IT jstack=0, istack[50];
309 for (j=l+1;j<=ir;j++)
315 if (arr[i].val <= aval)
331 if (arr[l+1].val > arr[ir].val)
335 if (arr[l].val > arr[ir].val)
339 if (arr[l+1].val > arr[l].val)
349 do i++;
while (arr[i].val < aval);
350 do j--;
while (arr[j].val > aval);
357 if (jstack > NSTACK){
358 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
377 template <
class IT,
class WT,
class SIGN>
384 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
386 if (this->signbit < rhs.signbit){
390 else if (this->signbit == rhs.signbit){
392 if (this->val < rhs.val){
396 else if (this->val > rhs.val){
412 if (this->signbit > rhs.
signbit){
416 else if (this->signbit == rhs.
signbit){
418 if (this->val < rhs.
val){
422 else if (this->val > rhs.
val){
435 bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
436 return !(*
this > rhs);}
438 return !(*
this < rhs);}
444 template <
class IT,
class WT,
class SIGN>
449 IT i, ir=n, j, k, l=1;
450 IT jstack=0, istack[50];
458 for (j=l+1;j<=ir;j++)
480 if (arr[l+1] > arr[ir])
484 if (arr[l] > arr[ir])
488 if (arr[l+1] > arr[l])
497 do i++;
while (arr[i] < a);
498 do j--;
while (arr[j] > a);
505 if (jstack > NSTACK){
506 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
528 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
534 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
536 RCP<const Environment> mj_env;
537 RCP<const Comm<int> > mj_problemComm;
539 double imbalance_tolerance;
540 mj_part_t *part_no_array;
542 int coord_dim, num_weights_per_coord;
544 size_t initial_num_loc_coords;
547 mj_lno_t num_local_coords;
548 mj_gno_t num_global_coords;
550 mj_scalar_t **mj_coordinates;
551 mj_scalar_t **mj_weights;
552 bool *mj_uniform_parts;
553 mj_scalar_t **mj_part_sizes;
554 bool *mj_uniform_weights;
556 ArrayView<const mj_gno_t> mj_gnos;
557 size_t num_global_parts;
559 mj_gno_t *initial_mj_gnos;
560 mj_gno_t *current_mj_gnos;
561 int *owner_of_coordinate;
563 mj_lno_t *coordinate_permutations;
564 mj_lno_t *new_coordinate_permutations;
565 mj_part_t *assigned_part_ids;
568 mj_lno_t *new_part_xadj;
571 bool distribute_points_on_cut_lines;
572 mj_part_t max_concurrent_part_calculation;
575 int mj_user_recursion_depth;
576 bool mj_keep_part_boxes;
578 int check_migrate_avoid_migration_option;
579 mj_scalar_t minimum_migration_imbalance;
582 mj_part_t total_num_cut ;
583 mj_part_t total_num_part;
585 mj_part_t max_num_part_along_dim ;
586 mj_part_t max_num_cut_along_dim;
587 size_t max_num_total_part_along_dim;
589 mj_part_t total_dim_num_reduce_all;
590 mj_part_t last_dim_num_part;
594 RCP<Comm<int> > comm;
596 mj_scalar_t sEpsilon;
598 mj_scalar_t maxScalar_t;
599 mj_scalar_t minScalar_t;
601 mj_scalar_t *all_cut_coordinates;
602 mj_scalar_t *max_min_coords;
603 mj_scalar_t *process_cut_line_weight_to_put_left;
604 mj_scalar_t **thread_cut_line_weight_to_put_left;
610 mj_scalar_t *cut_coordinates_work_array;
613 mj_scalar_t *target_part_weights;
615 mj_scalar_t *cut_upper_bound_coordinates ;
616 mj_scalar_t *cut_lower_bound_coordinates ;
617 mj_scalar_t *cut_lower_bound_weights ;
618 mj_scalar_t *cut_upper_bound_weights ;
620 mj_scalar_t *process_local_min_max_coord_total_weight ;
621 mj_scalar_t *global_min_max_coord_total_weight ;
625 bool *is_cut_line_determined;
628 mj_part_t *my_incomplete_cut_count;
630 double **thread_part_weights;
632 double **thread_part_weight_work;
635 mj_scalar_t **thread_cut_left_closest_point;
637 mj_scalar_t **thread_cut_right_closest_point;
640 mj_lno_t **thread_point_counts;
642 mj_scalar_t *process_rectilinear_cut_weight;
643 mj_scalar_t *global_rectilinear_cut_weight;
649 mj_scalar_t *total_part_weight_left_right_closests ;
650 mj_scalar_t *global_total_part_weight_left_right_closests;
652 RCP<mj_partBoxVector_t> kept_boxes;
655 RCP<mj_partBox_t> global_box;
656 int myRank, myActualRank;
665 void set_part_specifications();
672 inline mj_part_t get_part_count(
673 mj_part_t num_total_future,
679 void allocate_set_work_memory();
686 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
689 void compute_global_box();
705 mj_part_t update_part_num_arrays(
706 std::vector<mj_part_t> &num_partitioning_in_current_dim,
707 std::vector<mj_part_t> *future_num_part_in_parts,
708 std::vector<mj_part_t> *next_future_num_parts_in_parts,
709 mj_part_t &future_num_parts,
710 mj_part_t current_num_parts,
711 int current_iteration,
712 RCP<mj_partBoxVector_t> input_part_boxes,
713 RCP<mj_partBoxVector_t> output_part_boxes);
726 void mj_get_local_min_max_coord_totW(
727 mj_lno_t coordinate_begin_index,
728 mj_lno_t coordinate_end_index,
729 mj_lno_t *mj_current_coordinate_permutations,
730 mj_scalar_t *mj_current_dim_coords,
731 mj_scalar_t &min_coordinate,
732 mj_scalar_t &max_coordinate,
733 mj_scalar_t &total_weight);
742 void mj_get_global_min_max_coord_totW(
743 mj_part_t current_concurrent_num_parts,
744 mj_scalar_t *local_min_max_total,
745 mj_scalar_t *global_min_max_total);
765 void mj_get_initial_cut_coords_target_weights(
766 mj_scalar_t min_coord,
767 mj_scalar_t max_coord,
769 mj_scalar_t global_weight,
770 mj_scalar_t *initial_cut_coords ,
771 mj_scalar_t *target_part_weights ,
773 std::vector <mj_part_t> *future_num_part_in_parts,
774 std::vector <mj_part_t> *next_future_num_parts_in_parts,
775 mj_part_t concurrent_current_part,
776 mj_part_t obtained_part_index);
790 void set_initial_coordinate_parts(
791 mj_scalar_t &max_coordinate,
792 mj_scalar_t &min_coordinate,
793 mj_part_t &concurrent_current_part_index,
794 mj_lno_t coordinate_begin_index,
795 mj_lno_t coordinate_end_index,
796 mj_lno_t *mj_current_coordinate_permutations,
797 mj_scalar_t *mj_current_dim_coords,
798 mj_part_t *mj_part_ids,
799 mj_part_t &partition_count);
812 mj_scalar_t *mj_current_dim_coords,
813 mj_scalar_t imbalanceTolerance,
814 mj_part_t current_work_part,
815 mj_part_t current_concurrent_num_parts,
816 mj_scalar_t *current_cut_coordinates,
817 mj_part_t total_incomplete_cut_count,
818 std::vector <mj_part_t> &num_partitioning_in_current_dim);
839 void mj_1D_part_get_thread_part_weights(
840 size_t total_part_count,
842 mj_scalar_t max_coord,
843 mj_scalar_t min_coord,
844 mj_lno_t coordinate_begin_index,
845 mj_lno_t coordinate_end_index,
846 mj_scalar_t *mj_current_dim_coords,
847 mj_scalar_t *temp_current_cut_coords,
848 bool *current_cut_status,
849 double *my_current_part_weights,
850 mj_scalar_t *my_current_left_closest,
851 mj_scalar_t *my_current_right_closest);
860 void mj_accumulate_thread_results(
861 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
862 mj_part_t current_work_part,
863 mj_part_t current_concurrent_num_parts);
895 void mj_get_new_cut_coordinates(
896 const size_t &num_total_part,
897 const mj_part_t &num_cuts,
898 const mj_scalar_t &max_coordinate,
899 const mj_scalar_t &min_coordinate,
900 const mj_scalar_t &global_total_weight,
901 const mj_scalar_t &used_imbalance_tolerance,
902 mj_scalar_t * current_global_part_weights,
903 const mj_scalar_t * current_local_part_weights,
904 const mj_scalar_t *current_part_target_weights,
905 bool *current_cut_line_determined,
906 mj_scalar_t *current_cut_coordinates,
907 mj_scalar_t *current_cut_upper_bounds,
908 mj_scalar_t *current_cut_lower_bounds,
909 mj_scalar_t *current_global_left_closest_points,
910 mj_scalar_t *current_global_right_closest_points,
911 mj_scalar_t * current_cut_lower_bound_weights,
912 mj_scalar_t * current_cut_upper_weights,
913 mj_scalar_t *new_current_cut_coordinates,
914 mj_scalar_t *current_part_cut_line_weight_to_put_left,
915 mj_part_t *rectilinear_cut_count,
916 mj_part_t &my_num_incomplete_cut);
927 void mj_calculate_new_cut_position (
928 mj_scalar_t cut_upper_bound,
929 mj_scalar_t cut_lower_bound,
930 mj_scalar_t cut_upper_weight,
931 mj_scalar_t cut_lower_weight,
932 mj_scalar_t expected_weight,
933 mj_scalar_t &new_cut_position);
945 void mj_create_new_partitions(
947 mj_scalar_t *mj_current_dim_coords,
948 mj_scalar_t *current_concurrent_cut_coordinate,
949 mj_lno_t coordinate_begin,
950 mj_lno_t coordinate_end,
951 mj_scalar_t *used_local_cut_line_weight_to_left,
952 double **used_thread_part_weight_work,
953 mj_lno_t *out_part_xadj);
977 bool mj_perform_migration(
978 mj_part_t in_num_parts,
979 mj_part_t &out_num_parts,
980 std::vector<mj_part_t> *next_future_num_parts_in_parts,
981 mj_part_t &output_part_begin_index,
982 size_t migration_reduce_all_population,
983 mj_lno_t num_coords_for_last_dim_part,
984 std::string iteration,
985 RCP<mj_partBoxVector_t> &input_part_boxes,
986 RCP<mj_partBoxVector_t> &output_part_boxes);
997 void get_processor_num_points_in_parts(
1000 mj_gno_t *&num_points_in_all_processor_parts);
1014 bool mj_check_to_migrate(
1015 size_t migration_reduce_all_population,
1016 mj_lno_t num_coords_for_last_dim_part,
1017 mj_part_t num_procs,
1018 mj_part_t num_parts,
1019 mj_gno_t *num_points_in_all_processor_parts);
1039 void mj_migration_part_proc_assignment(
1040 mj_gno_t * num_points_in_all_processor_parts,
1041 mj_part_t num_parts,
1042 mj_part_t num_procs,
1043 mj_lno_t *send_count_to_each_proc,
1044 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1045 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1046 mj_part_t &out_num_part,
1047 std::vector<mj_part_t> &out_part_indices,
1048 mj_part_t &output_part_numbering_begin_index,
1049 int *coordinate_destinations);
1067 void mj_assign_proc_to_parts(
1068 mj_gno_t * num_points_in_all_processor_parts,
1069 mj_part_t num_parts,
1070 mj_part_t num_procs,
1071 mj_lno_t *send_count_to_each_proc,
1072 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1073 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1074 mj_part_t &out_part_index,
1075 mj_part_t &output_part_numbering_begin_index,
1076 int *coordinate_destinations);
1088 void assign_send_destinations(
1089 mj_part_t num_parts,
1090 mj_part_t *part_assignment_proc_begin_indices,
1091 mj_part_t *processor_chains_in_parts,
1092 mj_lno_t *send_count_to_each_proc,
1093 int *coordinate_destinations);
1107 void assign_send_destinations2(
1108 mj_part_t num_parts,
1110 int *coordinate_destinations,
1111 mj_part_t &output_part_numbering_begin_index,
1112 std::vector<mj_part_t> *next_future_num_parts_in_parts);
1130 void mj_assign_parts_to_procs(
1131 mj_gno_t * num_points_in_all_processor_parts,
1132 mj_part_t num_parts,
1133 mj_part_t num_procs,
1134 mj_lno_t *send_count_to_each_proc,
1135 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1136 mj_part_t &out_num_part,
1137 std::vector<mj_part_t> &out_part_indices,
1138 mj_part_t &output_part_numbering_begin_index,
1139 int *coordinate_destinations);
1153 void mj_migrate_coords(
1154 mj_part_t num_procs,
1155 mj_lno_t &num_new_local_points,
1156 std::string iteration,
1157 int *coordinate_destinations,
1158 mj_part_t num_parts);
1166 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1174 void fill_permutation_array(
1175 mj_part_t output_num_parts,
1176 mj_part_t num_parts);
1186 void set_final_parts(
1187 mj_part_t current_num_parts,
1188 mj_part_t output_part_begin_index,
1189 RCP<mj_partBoxVector_t> &output_part_boxes,
1190 bool is_data_ever_migrated);
1193 void free_work_memory();
1207 void create_consistent_chunks(
1208 mj_part_t num_parts,
1209 mj_scalar_t *mj_current_dim_coords,
1210 mj_scalar_t *current_concurrent_cut_coordinate,
1211 mj_lno_t coordinate_begin,
1212 mj_lno_t coordinate_end,
1213 mj_scalar_t *used_local_cut_line_weight_to_left,
1214 mj_lno_t *out_part_xadj,
1248 const RCP<const Environment> &env,
1249 RCP<
const Comm<int> > &problemComm,
1251 double imbalance_tolerance,
1252 size_t num_global_parts,
1253 mj_part_t *part_no_array,
1254 int recursion_depth,
1257 mj_lno_t num_local_coords,
1258 mj_gno_t num_global_coords,
1259 const mj_gno_t *initial_mj_gnos,
1260 mj_scalar_t **mj_coordinates,
1262 int num_weights_per_coord,
1263 bool *mj_uniform_weights,
1264 mj_scalar_t **mj_weights,
1265 bool *mj_uniform_parts,
1266 mj_scalar_t **mj_part_sizes,
1268 mj_part_t *&result_assigned_part_ids,
1269 mj_gno_t *&result_mj_gnos
1280 bool distribute_points_on_cut_lines_,
1281 int max_concurrent_part_calculation_,
1282 int check_migrate_avoid_migration_option_,
1283 mj_scalar_t minimum_migration_imbalance_);
1296 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1323 const RCP<const Environment> &env,
1324 mj_lno_t num_total_coords,
1325 mj_lno_t num_selected_coords,
1326 size_t num_target_part,
1328 mj_scalar_t **mj_coordinates,
1329 mj_lno_t *initial_selected_coords_output_permutation,
1330 mj_lno_t *output_xadj,
1331 int recursion_depth,
1332 const mj_part_t *part_no_array,
1333 bool partition_along_longest_dim);
1361 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1364 const RCP<const Environment> &env,
1365 mj_lno_t num_total_coords,
1366 mj_lno_t num_selected_coords,
1367 size_t num_target_part,
1369 mj_scalar_t **mj_coordinates_,
1370 mj_lno_t *inital_adjList_output_adjlist,
1371 mj_lno_t *output_xadj,
1373 const mj_part_t *part_no_array_,
1374 bool partition_along_longest_dim
1378 const RCP<Comm<int> > commN;
1379 this->mj_problemComm =
1380 Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1382 Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1383 this->myActualRank = this->myRank = 1;
1385 #ifdef HAVE_ZOLTAN2_OMP 1386 int actual_num_threads = omp_get_num_threads();
1387 omp_set_num_threads(1);
1395 this->imbalance_tolerance = 0;
1396 this->num_global_parts = num_target_part;
1397 this->part_no_array = (mj_part_t *)part_no_array_;
1398 this->recursion_depth = rd;
1400 this->coord_dim = coord_dim_;
1401 this->num_local_coords = num_total_coords;
1402 this->num_global_coords = num_total_coords;
1403 this->mj_coordinates = mj_coordinates_;
1407 this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1409 this->num_weights_per_coord = 0;
1410 bool *tmp_mj_uniform_weights =
new bool[1];
1411 this->mj_uniform_weights = tmp_mj_uniform_weights ;
1412 this->mj_uniform_weights[0] =
true;
1414 mj_scalar_t **tmp_mj_weights =
new mj_scalar_t *[1];
1415 this->mj_weights = tmp_mj_weights;
1417 bool *tmp_mj_uniform_parts =
new bool[1];
1418 this->mj_uniform_parts = tmp_mj_uniform_parts;
1419 this->mj_uniform_parts[0] =
true;
1421 mj_scalar_t **tmp_mj_part_sizes =
new mj_scalar_t * [1];
1422 this->mj_part_sizes = tmp_mj_part_sizes;
1423 this->mj_part_sizes[0] = NULL;
1425 this->num_threads = 1;
1426 this->set_part_specifications();
1428 this->allocate_set_work_memory();
1430 this->part_xadj[0] =
static_cast<mj_lno_t
>(num_selected_coords);
1431 for(
size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1432 this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1435 mj_part_t current_num_parts = 1;
1437 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1439 mj_part_t future_num_parts = this->total_num_part;
1441 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
1442 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
1443 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1444 RCP<mj_partBoxVector_t> t1;
1445 RCP<mj_partBoxVector_t> t2;
1448 std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1450 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1451 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1453 for (
int i = 0; i < this->recursion_depth; ++i){
1457 std::vector <mj_part_t> num_partitioning_in_current_dim;
1471 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1472 future_num_part_in_parts = next_future_num_parts_in_parts;
1473 next_future_num_parts_in_parts = tmpPartVect;
1478 next_future_num_parts_in_parts->clear();
1482 mj_part_t output_part_count_in_dimension =
1483 this->update_part_num_arrays(
1484 num_partitioning_in_current_dim,
1485 future_num_part_in_parts,
1486 next_future_num_parts_in_parts,
1496 if(output_part_count_in_dimension == current_num_parts) {
1497 tmpPartVect= future_num_part_in_parts;
1498 future_num_part_in_parts = next_future_num_parts_in_parts;
1499 next_future_num_parts_in_parts = tmpPartVect;
1504 std::string istring = Teuchos::toString<int>(i);
1508 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1511 mj_part_t output_part_index = 0;
1514 mj_part_t output_coordinate_end_index = 0;
1516 mj_part_t current_work_part = 0;
1517 mj_part_t current_concurrent_num_parts = 1;
1519 mj_part_t obtained_part_index = 0;
1522 int coordInd = i % this->coord_dim;
1523 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1527 for (; current_work_part < current_num_parts;
1528 current_work_part += current_concurrent_num_parts){
1534 mj_part_t actual_work_part_count = 0;
1538 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1539 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1543 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1546 ++actual_work_part_count;
1547 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1548 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1558 if(partition_along_longest_dim){
1560 mj_scalar_t best_weight_coord = 0;
1561 for (
int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1562 mj_scalar_t best_min_coord = 0;
1563 mj_scalar_t best_max_coord = 0;
1566 this->mj_get_local_min_max_coord_totW(
1567 coordinate_begin_index,
1568 coordinate_end_index,
1569 this->coordinate_permutations,
1570 this->mj_coordinates[coord_traverse_ind],
1576 coord_dim_mins[coord_traverse_ind] = best_min_coord;
1577 coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1578 mj_scalar_t best_range = best_max_coord - best_min_coord;
1579 coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1580 coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1581 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1585 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1586 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1595 mj_current_dim_coords = this->mj_coordinates[coordInd];
1597 this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1598 this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1599 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1603 this->mj_get_local_min_max_coord_totW(
1604 coordinate_begin_index,
1605 coordinate_end_index,
1606 this->coordinate_permutations,
1607 mj_current_dim_coords,
1608 this->process_local_min_max_coord_total_weight[kk],
1609 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
1610 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]
1616 if (actual_work_part_count > 0){
1618 this->mj_get_global_min_max_coord_totW(
1619 current_concurrent_num_parts,
1620 this->process_local_min_max_coord_total_weight,
1621 this->global_min_max_coord_total_weight);
1625 mj_part_t total_incomplete_cut_count = 0;
1630 mj_part_t concurrent_part_cut_shift = 0;
1631 mj_part_t concurrent_part_part_shift = 0;
1632 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1633 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1634 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1635 current_concurrent_num_parts];
1636 mj_scalar_t global_total_weight =
1637 this->global_min_max_coord_total_weight[kk +
1638 2 * current_concurrent_num_parts];
1640 mj_part_t concurrent_current_part_index = current_work_part + kk;
1642 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1644 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1645 mj_scalar_t *current_target_part_weights = this->target_part_weights +
1646 concurrent_part_part_shift;
1648 concurrent_part_cut_shift += partition_count - 1;
1650 concurrent_part_part_shift += partition_count;
1654 if(partition_count > 1 && min_coordinate <= max_coordinate){
1658 total_incomplete_cut_count += partition_count - 1;
1661 this->my_incomplete_cut_count[kk] = partition_count - 1;
1664 this->mj_get_initial_cut_coords_target_weights(
1667 partition_count - 1,
1668 global_total_weight,
1670 current_target_part_weights,
1671 future_num_part_in_parts,
1672 next_future_num_parts_in_parts,
1673 concurrent_current_part_index,
1674 obtained_part_index);
1676 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1677 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1680 this->set_initial_coordinate_parts(
1683 concurrent_current_part_index,
1684 coordinate_begin_index, coordinate_end_index,
1685 this->coordinate_permutations,
1686 mj_current_dim_coords,
1687 this->assigned_part_ids,
1692 this->my_incomplete_cut_count[kk] = 0;
1694 obtained_part_index += partition_count;
1698 mj_scalar_t used_imbalance = 0;
1702 mj_current_dim_coords,
1705 current_concurrent_num_parts,
1706 current_cut_coordinates,
1707 total_incomplete_cut_count,
1708 num_partitioning_in_current_dim);
1714 mj_part_t output_array_shift = 0;
1715 mj_part_t cut_shift = 0;
1716 size_t tlr_shift = 0;
1717 size_t partweight_array_shift = 0;
1719 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1720 mj_part_t current_concurrent_work_part = current_work_part + kk;
1721 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1724 if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1725 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1727 for(mj_part_t jj = 0; jj < num_parts; ++jj){
1728 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1730 cut_shift += num_parts - 1;
1731 tlr_shift += (4 *(num_parts - 1) + 1);
1732 output_array_shift += num_parts;
1733 partweight_array_shift += (2 * (num_parts - 1) + 1);
1737 mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1738 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1740 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1741 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1744 for(
int ii = 0; ii < this->num_threads; ++ii){
1745 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1750 this->create_consistent_chunks(
1752 mj_current_dim_coords,
1753 current_concurrent_cut_coordinate,
1756 used_local_cut_line_weight_to_left,
1757 this->new_part_xadj + output_part_index + output_array_shift,
1763 mj_lno_t part_size = coordinate_end - coordinate_begin;
1764 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1765 memcpy(this->new_coordinate_permutations + coordinate_begin,
1766 this->coordinate_permutations + coordinate_begin,
1767 part_size *
sizeof(mj_lno_t));
1772 cut_shift += num_parts - 1;
1773 tlr_shift += (4 *(num_parts - 1) + 1);
1774 output_array_shift += num_parts;
1775 partweight_array_shift += (2 * (num_parts - 1) + 1);
1784 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1785 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1786 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1788 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1790 mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1791 mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1793 for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1794 mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1795 mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1800 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1802 output_part_index += num_parts ;
1809 current_num_parts = output_part_count_in_dimension;
1812 mj_lno_t * tmp = this->coordinate_permutations;
1813 this->coordinate_permutations = this->new_coordinate_permutations;
1814 this->new_coordinate_permutations = tmp;
1816 freeArray<mj_lno_t>(this->part_xadj);
1817 this->part_xadj = this->new_part_xadj;
1818 this->new_part_xadj = NULL;
1821 for(mj_lno_t i = 0; i < num_total_coords; ++i){
1822 inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1827 for(
size_t i = 0; i < this->num_global_parts ; ++i){
1828 output_xadj[i+1] = this->part_xadj[i];
1831 delete future_num_part_in_parts;
1832 delete next_future_num_parts_in_parts;
1835 freeArray<mj_part_t>(this->assigned_part_ids);
1836 freeArray<mj_gno_t>(this->initial_mj_gnos);
1837 freeArray<mj_gno_t>(this->current_mj_gnos);
1838 freeArray<bool>(tmp_mj_uniform_weights);
1839 freeArray<bool>(tmp_mj_uniform_parts);
1840 freeArray<mj_scalar_t *>(tmp_mj_weights);
1841 freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1843 this->free_work_memory();
1845 #ifdef HAVE_ZOLTAN2_OMP 1846 omp_set_num_threads(actual_num_threads);
1853 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1856 mj_env(), mj_problemComm(), imbalance_tolerance(0),
1857 part_no_array(NULL), recursion_depth(0), coord_dim(0),
1858 num_weights_per_coord(0), initial_num_loc_coords(0),
1859 initial_num_glob_coords(0),
1860 num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1861 mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1862 mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1863 initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1864 coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1865 assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1866 distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1867 mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
1868 check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
1869 num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1870 max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1871 last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1872 all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1873 thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1874 target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1875 cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1876 process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1877 is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1878 thread_part_weights(NULL), thread_part_weight_work(NULL),
1879 thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1880 thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1881 global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1882 global_total_part_weight_left_right_closests(NULL),
1883 kept_boxes(),global_box(),
1884 myRank(0), myActualRank(0)
1889 this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1890 this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1898 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1900 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1903 return this->global_box;
1909 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1912 this->mj_keep_part_boxes =
true;
1923 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1927 this->total_num_cut = 0;
1928 this->total_num_part = 1;
1929 this->max_num_part_along_dim = 0;
1930 this->total_dim_num_reduce_all = 0;
1931 this->last_dim_num_part = 1;
1934 this->max_num_cut_along_dim = 0;
1935 this->max_num_total_part_along_dim = 0;
1937 if (this->part_no_array){
1939 for (
int i = 0; i < this->recursion_depth; ++i){
1940 this->total_dim_num_reduce_all += this->total_num_part;
1941 this->total_num_part *= this->part_no_array[i];
1942 if(this->part_no_array[i] > this->max_num_part_along_dim) {
1943 this->max_num_part_along_dim = this->part_no_array[i];
1946 this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1947 this->num_global_parts = this->total_num_part;
1949 mj_part_t future_num_parts = this->num_global_parts;
1952 for (
int i = 0; i < this->recursion_depth; ++i){
1954 mj_part_t maxNoPartAlongI = this->get_part_count(
1955 future_num_parts, 1.0f / (this->recursion_depth - i));
1957 if (maxNoPartAlongI > this->max_num_part_along_dim){
1958 this->max_num_part_along_dim = maxNoPartAlongI;
1961 mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
1962 if (future_num_parts % maxNoPartAlongI){
1965 future_num_parts = nfutureNumParts;
1967 this->total_num_part = this->num_global_parts;
1971 for (
int i = 0; i < this->recursion_depth; ++i){
1972 this->total_dim_num_reduce_all += p;
1973 p *= this->max_num_part_along_dim;
1976 this->last_dim_num_part = p / this->max_num_part_along_dim;
1979 this->total_num_cut = this->total_num_part - 1;
1980 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
1981 this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
1985 if(this->max_concurrent_part_calculation > this->last_dim_num_part){
1986 if(this->mj_problemComm->getRank() == 0){
1987 std::cerr <<
"Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
1988 ") has been set bigger than maximum amount that can be used." <<
1989 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
1991 this->max_concurrent_part_calculation = this->last_dim_num_part;
2000 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2002 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2003 mj_part_t num_total_future,
2006 double fp = pow(num_total_future,
root);
2007 mj_part_t ip = mj_part_t (fp);
2008 if (fp - ip < this->fEpsilon * 100){
2030 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2032 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2033 std::vector <mj_part_t> &num_partitioning_in_current_dim,
2034 std::vector<mj_part_t> *future_num_part_in_parts,
2035 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2036 mj_part_t &future_num_parts,
2037 mj_part_t current_num_parts,
2038 int current_iteration,
2039 RCP<mj_partBoxVector_t> input_part_boxes,
2040 RCP<mj_partBoxVector_t> output_part_boxes
2043 mj_part_t output_num_parts = 0;
2044 if(this->part_no_array){
2049 mj_part_t p = this->part_no_array[current_iteration];
2051 std::cout <<
"i:" << current_iteration <<
" p is given as:" << p << std::endl;
2055 return current_num_parts;
2058 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2059 num_partitioning_in_current_dim.push_back(p);
2073 future_num_parts /= num_partitioning_in_current_dim[0];
2074 output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2076 if (this->mj_keep_part_boxes){
2077 for (mj_part_t k = 0; k < current_num_parts; ++k){
2079 for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2080 output_part_boxes->push_back((*input_part_boxes)[k]);
2088 for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2089 next_future_num_parts_in_parts->push_back(future_num_parts);
2099 future_num_parts = 1;
2103 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2105 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2109 mj_part_t num_partitions_in_current_dim =
2110 this->get_part_count(
2111 future_num_parts_of_part_ii,
2112 1.0 / (this->recursion_depth - current_iteration)
2115 if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2116 std::cerr <<
"ERROR: maxPartNo calculation is wrong." << std::endl;
2120 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2124 output_num_parts += num_partitions_in_current_dim;
2127 mj_part_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
2128 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2129 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2132 if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
2134 ++num_future_parts_for_part_iii;
2136 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);
2139 if (this->mj_keep_part_boxes){
2140 output_part_boxes->push_back((*input_part_boxes)[ii]);
2144 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2148 return output_num_parts;
2155 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2157 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2160 this->owner_of_coordinate = NULL;
2165 this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2167 #ifdef HAVE_ZOLTAN2_OMP 2168 #pragma omp parallel for 2170 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2171 this->coordinate_permutations[i] = i;
2175 this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2177 this->assigned_part_ids = NULL;
2178 if(this->num_local_coords > 0){
2179 this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2185 this->part_xadj = allocMemory<mj_lno_t>(1);
2186 this->part_xadj[0] =
static_cast<mj_lno_t
>(this->num_local_coords);
2188 this->new_part_xadj = NULL;
2194 this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2196 this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2198 this->process_cut_line_weight_to_put_left = NULL;
2199 this->thread_cut_line_weight_to_put_left = NULL;
2201 if(this->distribute_points_on_cut_lines){
2202 this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2203 this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2204 for(
int i = 0; i < this->num_threads; ++i){
2205 this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2207 this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2208 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2216 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2217 this->max_concurrent_part_calculation);
2221 this->target_part_weights = allocMemory<mj_scalar_t>(
2222 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2225 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2226 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2227 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2228 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2230 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2231 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2235 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2238 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2240 this->thread_part_weights = allocMemory<double *>(this->num_threads);
2242 this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2245 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2247 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2250 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2252 for(
int i = 0; i < this->num_threads; ++i){
2254 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2255 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2256 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2257 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2263 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2264 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2267 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2268 for (
int i=0; i < this->coord_dim; i++){
2269 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2270 #ifdef HAVE_ZOLTAN2_OMP 2271 #pragma omp parallel for 2273 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2274 coord[i][j] = this->mj_coordinates[i][j];
2276 this->mj_coordinates = coord;
2279 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2280 mj_scalar_t **
weights = allocMemory<mj_scalar_t *>(criteria_dim);
2282 for (
int i=0; i < criteria_dim; i++){
2285 for (
int i=0; i < this->num_weights_per_coord; i++){
2286 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2287 #ifdef HAVE_ZOLTAN2_OMP 2288 #pragma omp parallel for 2290 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2291 weights[i][j] = this->mj_weights[i][j];
2295 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2296 #ifdef HAVE_ZOLTAN2_OMP 2297 #pragma omp parallel for 2299 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2300 this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2302 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2304 #ifdef HAVE_ZOLTAN2_OMP 2305 #pragma omp parallel for 2307 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2308 this->owner_of_coordinate[j] = this->myActualRank;
2313 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2315 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2318 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2320 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2322 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2324 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2326 for (
int i = 0; i < this->coord_dim; ++i){
2327 mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2328 mj_scalar_t localMax = -localMin;
2329 if (localMax > 0) localMax = 0;
2332 for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2333 if (this->mj_coordinates[i][j] < localMin){
2334 localMin = this->mj_coordinates[i][j];
2336 if (this->mj_coordinates[i][j] > localMax){
2337 localMax = this->mj_coordinates[i][j];
2346 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2347 this->coord_dim, mins, gmins
2351 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2352 this->coord_dim, maxs, gmaxs
2358 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2360 freeArray<mj_scalar_t>(mins);
2361 freeArray<mj_scalar_t>(gmins);
2362 freeArray<mj_scalar_t>(maxs);
2363 freeArray<mj_scalar_t>(gmaxs);
2371 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2373 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2374 RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2377 mj_partBox_t tmp_box(*global_box);
2378 initial_partitioning_boxes->push_back(tmp_box);
2391 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2393 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2394 mj_lno_t coordinate_begin_index,
2395 mj_lno_t coordinate_end_index,
2396 mj_lno_t *mj_current_coordinate_permutations,
2397 mj_scalar_t *mj_current_dim_coords,
2398 mj_scalar_t &min_coordinate,
2399 mj_scalar_t &max_coordinate,
2400 mj_scalar_t &total_weight){
2404 if(coordinate_begin_index >= coordinate_end_index)
2406 min_coordinate = this->maxScalar_t;
2407 max_coordinate = this->minScalar_t;
2411 mj_scalar_t my_total_weight = 0;
2412 #ifdef HAVE_ZOLTAN2_OMP 2413 #pragma omp parallel 2417 if (this->mj_uniform_weights[0]) {
2418 #ifdef HAVE_ZOLTAN2_OMP 2422 my_total_weight = coordinate_end_index - coordinate_begin_index;
2428 #ifdef HAVE_ZOLTAN2_OMP 2429 #pragma omp for reduction(+:my_total_weight) 2431 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2432 int i = mj_current_coordinate_permutations[ii];
2433 my_total_weight += this->mj_weights[0][i];
2437 int my_thread_id = 0;
2438 #ifdef HAVE_ZOLTAN2_OMP 2439 my_thread_id = omp_get_thread_num();
2441 mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2442 my_thread_min_coord=my_thread_max_coord
2443 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2446 #ifdef HAVE_ZOLTAN2_OMP 2449 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2450 int i = mj_current_coordinate_permutations[j];
2451 if(mj_current_dim_coords[i] > my_thread_max_coord)
2452 my_thread_max_coord = mj_current_dim_coords[i];
2453 if(mj_current_dim_coords[i] < my_thread_min_coord)
2454 my_thread_min_coord = mj_current_dim_coords[i];
2456 this->max_min_coords[my_thread_id] = my_thread_min_coord;
2457 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2459 #ifdef HAVE_ZOLTAN2_OMP 2462 #pragma omp single nowait 2465 min_coordinate = this->max_min_coords[0];
2466 for(
int i = 1; i < this->num_threads; ++i){
2467 if(this->max_min_coords[i] < min_coordinate)
2468 min_coordinate = this->max_min_coords[i];
2472 #ifdef HAVE_ZOLTAN2_OMP 2473 #pragma omp single nowait 2476 max_coordinate = this->max_min_coords[this->num_threads];
2477 for(
int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2478 if(this->max_min_coords[i] > max_coordinate)
2479 max_coordinate = this->max_min_coords[i];
2483 total_weight = my_total_weight;
2495 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2497 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2498 mj_part_t current_concurrent_num_parts,
2499 mj_scalar_t *local_min_max_total,
2500 mj_scalar_t *global_min_max_total){
2505 if(this->comm->getSize() > 1){
2508 current_concurrent_num_parts,
2509 current_concurrent_num_parts,
2510 current_concurrent_num_parts);
2512 reduceAll<int, mj_scalar_t>(
2515 3 * current_concurrent_num_parts,
2516 local_min_max_total,
2517 global_min_max_total);
2522 mj_part_t s = 3 * current_concurrent_num_parts;
2523 for (mj_part_t i = 0; i < s; ++i){
2524 global_min_max_total[i] = local_min_max_total[i];
2549 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2551 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2552 mj_scalar_t min_coord,
2553 mj_scalar_t max_coord,
2554 mj_part_t num_cuts ,
2555 mj_scalar_t global_weight,
2556 mj_scalar_t *initial_cut_coords ,
2557 mj_scalar_t *current_target_part_weights ,
2559 std::vector <mj_part_t> *future_num_part_in_parts,
2560 std::vector <mj_part_t> *next_future_num_parts_in_parts,
2561 mj_part_t concurrent_current_part,
2562 mj_part_t obtained_part_index
2565 mj_scalar_t coord_range = max_coord - min_coord;
2566 if(this->mj_uniform_parts[0]){
2568 mj_part_t cumulative = 0;
2570 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2574 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2580 for(mj_part_t i = 0; i < num_cuts; ++i){
2581 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2589 current_target_part_weights[i] = cumulative * unit_part_weight;
2592 initial_cut_coords[i] = min_coord + (coord_range *
2593 cumulative) / total_future_part_count_in_part;
2595 current_target_part_weights[num_cuts] = 1;
2599 if (this->mj_uniform_weights[0]){
2600 for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2601 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2606 std::cerr <<
"MJ does not support non uniform part weights" << std::endl;
2624 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2626 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
2627 mj_scalar_t &max_coordinate,
2628 mj_scalar_t &min_coordinate,
2629 mj_part_t &concurrent_current_part_index,
2630 mj_lno_t coordinate_begin_index,
2631 mj_lno_t coordinate_end_index,
2632 mj_lno_t *mj_current_coordinate_permutations,
2633 mj_scalar_t *mj_current_dim_coords,
2634 mj_part_t *mj_part_ids,
2635 mj_part_t &partition_count
2637 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2641 if(
ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2642 #ifdef HAVE_ZOLTAN2_OMP 2643 #pragma omp parallel for 2645 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2646 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2653 mj_scalar_t slice = coordinate_range / partition_count;
2655 #ifdef HAVE_ZOLTAN2_OMP 2656 #pragma omp parallel for 2658 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2660 mj_lno_t iii = mj_current_coordinate_permutations[ii];
2661 mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2662 mj_part_ids[iii] = 2 * pp;
2678 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2680 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
2681 mj_scalar_t *mj_current_dim_coords,
2682 mj_scalar_t used_imbalance_tolerance,
2683 mj_part_t current_work_part,
2684 mj_part_t current_concurrent_num_parts,
2685 mj_scalar_t *current_cut_coordinates,
2686 mj_part_t total_incomplete_cut_count,
2687 std::vector <mj_part_t> &num_partitioning_in_current_dim
2691 mj_part_t rectilinear_cut_count = 0;
2692 mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2695 *reductionOp = NULL;
2697 <mj_part_t, mj_scalar_t>(
2698 &num_partitioning_in_current_dim ,
2700 current_concurrent_num_parts);
2702 size_t total_reduction_size = 0;
2703 #ifdef HAVE_ZOLTAN2_OMP 2704 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) 2708 #ifdef HAVE_ZOLTAN2_OMP 2709 me = omp_get_thread_num();
2711 double *my_thread_part_weights = this->thread_part_weights[me];
2712 mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2713 mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2715 #ifdef HAVE_ZOLTAN2_OMP 2721 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2723 mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2724 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2725 total_reduction_size += (4 * num_cut_in_dim + 1);
2727 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2728 this->is_cut_line_determined[next] =
false;
2729 this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i];
2730 this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts];
2732 this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts];
2733 this->cut_lower_bound_weights[next] = 0;
2735 if(this->distribute_points_on_cut_lines){
2736 this->process_cut_line_weight_to_put_left[next] = 0;
2747 while (total_incomplete_cut_count != 0){
2750 mj_part_t concurrent_cut_shifts = 0;
2751 size_t total_part_shift = 0;
2753 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2754 mj_part_t num_parts = -1;
2755 num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2757 mj_part_t num_cuts = num_parts - 1;
2758 size_t total_part_count = num_parts + size_t (num_cuts) ;
2759 if (this->my_incomplete_cut_count[kk] > 0){
2762 bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2763 double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2764 mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2765 mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2767 mj_part_t conccurent_current_part = current_work_part + kk;
2768 mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2769 mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2770 mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2772 mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2773 mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2776 this->mj_1D_part_get_thread_part_weights(
2781 coordinate_begin_index,
2782 coordinate_end_index,
2783 mj_current_dim_coords,
2784 temp_current_cut_coords,
2786 my_current_part_weights,
2787 my_current_left_closest,
2788 my_current_right_closest);
2792 concurrent_cut_shifts += num_cuts;
2793 total_part_shift += total_part_count;
2797 this->mj_accumulate_thread_results(
2798 num_partitioning_in_current_dim,
2800 current_concurrent_num_parts);
2803 #ifdef HAVE_ZOLTAN2_OMP 2807 if(this->comm->getSize() > 1){
2808 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2809 total_reduction_size,
2810 this->total_part_weight_left_right_closests,
2811 this->global_total_part_weight_left_right_closests);
2816 this->global_total_part_weight_left_right_closests,
2817 this->total_part_weight_left_right_closests,
2818 total_reduction_size *
sizeof(mj_scalar_t));
2823 mj_part_t cut_shift = 0;
2826 size_t tlr_shift = 0;
2827 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2828 mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2829 mj_part_t num_cuts = num_parts - 1;
2830 size_t num_total_part = num_parts + size_t (num_cuts) ;
2835 if (this->my_incomplete_cut_count[kk] == 0) {
2836 cut_shift += num_cuts;
2837 tlr_shift += (num_total_part + 2 * num_cuts);
2841 mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2842 mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2843 mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part;
2844 mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts;
2845 mj_scalar_t *current_global_part_weights = current_global_tlr;
2846 bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2848 mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2849 mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2851 mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2852 mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2853 mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2854 mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2855 mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2856 mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2857 mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2859 mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2862 this->mj_get_new_cut_coordinates(
2867 global_total_weight,
2868 used_imbalance_tolerance,
2869 current_global_part_weights,
2870 current_local_part_weights,
2871 current_part_target_weights,
2872 current_cut_line_determined,
2873 temp_cut_coords + cut_shift,
2874 current_cut_upper_bounds,
2875 current_cut_lower_bounds,
2876 current_global_left_closest_points,
2877 current_global_right_closest_points,
2878 current_cut_lower_bound_weights,
2879 current_cut_upper_weights,
2880 this->cut_coordinates_work_array +cut_shift,
2881 current_part_cut_line_weight_to_put_left,
2882 &rectilinear_cut_count,
2883 this->my_incomplete_cut_count[kk]);
2885 cut_shift += num_cuts;
2886 tlr_shift += (num_total_part + 2 * num_cuts);
2887 mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
2888 #ifdef HAVE_ZOLTAN2_OMP 2892 total_incomplete_cut_count -= iteration_complete_cut_count;
2897 #ifdef HAVE_ZOLTAN2_OMP 2903 mj_scalar_t *t = temp_cut_coords;
2904 temp_cut_coords = this->cut_coordinates_work_array;
2905 this->cut_coordinates_work_array = t;
2914 if (current_cut_coordinates != temp_cut_coords){
2915 #ifdef HAVE_ZOLTAN2_OMP 2920 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2921 mj_part_t num_parts = -1;
2922 num_parts = num_partitioning_in_current_dim[current_work_part + i];
2923 mj_part_t num_cuts = num_parts - 1;
2925 for(mj_part_t ii = 0; ii < num_cuts; ++ii){
2926 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
2932 #ifdef HAVE_ZOLTAN2_OMP 2936 this->cut_coordinates_work_array = temp_cut_coords;
2963 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2965 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
2966 size_t total_part_count,
2968 mj_scalar_t max_coord,
2969 mj_scalar_t min_coord,
2970 mj_lno_t coordinate_begin_index,
2971 mj_lno_t coordinate_end_index,
2972 mj_scalar_t *mj_current_dim_coords,
2973 mj_scalar_t *temp_current_cut_coords,
2974 bool *current_cut_status,
2975 double *my_current_part_weights,
2976 mj_scalar_t *my_current_left_closest,
2977 mj_scalar_t *my_current_right_closest){
2980 for (
size_t i = 0; i < total_part_count; ++i){
2981 my_current_part_weights[i] = 0;
2986 for(mj_part_t i = 0; i < num_cuts; ++i){
2987 my_current_left_closest[i] = min_coord - 1;
2988 my_current_right_closest[i] = max_coord + 1;
2991 mj_scalar_t minus_EPSILON = -this->sEpsilon;
2992 #ifdef HAVE_ZOLTAN2_OMP 2998 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2999 int i = this->coordinate_permutations[ii];
3003 mj_part_t j = this->assigned_part_ids[i] / 2;
3009 mj_part_t lower_cut_index = 0;
3010 mj_part_t upper_cut_index = num_cuts - 1;
3012 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3013 bool is_inserted =
false;
3014 bool is_on_left_of_cut =
false;
3015 bool is_on_right_of_cut =
false;
3016 mj_part_t last_compared_part = -1;
3018 mj_scalar_t coord = mj_current_dim_coords[i];
3020 while(upper_cut_index >= lower_cut_index)
3023 last_compared_part = -1;
3024 is_on_left_of_cut =
false;
3025 is_on_right_of_cut =
false;
3026 mj_scalar_t cut = temp_current_cut_coords[j];
3027 mj_scalar_t distance_to_cut = coord - cut;
3028 mj_scalar_t abs_distance_to_cut =
ZOLTAN2_ABS(distance_to_cut);
3031 if(abs_distance_to_cut < this->sEpsilon){
3033 my_current_part_weights[j * 2 + 1] += w;
3034 this->assigned_part_ids[i] = j * 2 + 1;
3037 my_current_left_closest[j] = coord;
3038 my_current_right_closest[j] = coord;
3041 mj_part_t kk = j + 1;
3042 while(kk < num_cuts){
3044 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3045 if(distance_to_cut < this->sEpsilon){
3046 my_current_part_weights[2 * kk + 1] += w;
3047 my_current_left_closest[kk] = coord;
3048 my_current_right_closest[kk] = coord;
3054 if(coord - my_current_left_closest[kk] > this->sEpsilon){
3055 my_current_left_closest[kk] = coord;
3065 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3066 if(distance_to_cut < this->sEpsilon){
3067 my_current_part_weights[2 * kk + 1] += w;
3069 this->assigned_part_ids[i] = kk * 2 + 1;
3070 my_current_left_closest[kk] = coord;
3071 my_current_right_closest[kk] = coord;
3077 if(my_current_right_closest[kk] - coord > this->sEpsilon){
3078 my_current_right_closest[kk] = coord;
3089 if (distance_to_cut < 0) {
3090 bool _break =
false;
3094 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3095 if(distance_to_next_cut > this->sEpsilon){
3101 upper_cut_index = j - 1;
3103 is_on_left_of_cut =
true;
3104 last_compared_part = j;
3109 bool _break =
false;
3110 if(j < num_cuts - 1){
3113 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3114 if(distance_to_next_cut < minus_EPSILON){
3121 lower_cut_index = j + 1;
3123 is_on_right_of_cut =
true;
3124 last_compared_part = j;
3129 j = (upper_cut_index + lower_cut_index) / 2;
3132 if(is_on_right_of_cut){
3135 my_current_part_weights[2 * last_compared_part + 2] += w;
3136 this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3139 if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3140 my_current_right_closest[last_compared_part] = coord;
3143 if(last_compared_part+1 < num_cuts){
3145 if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3146 my_current_left_closest[last_compared_part + 1] = coord;
3151 else if(is_on_left_of_cut){
3154 my_current_part_weights[2 * last_compared_part] += w;
3155 this->assigned_part_ids[i] = 2 * last_compared_part;
3159 if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3160 my_current_left_closest[last_compared_part] = coord;
3164 if(last_compared_part-1 >= 0){
3165 if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3166 my_current_right_closest[last_compared_part -1] = coord;
3175 for (
size_t i = 1; i < total_part_count; ++i){
3179 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3180 ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3185 my_current_part_weights[i] = my_current_part_weights[i-2];
3189 my_current_part_weights[i] += my_current_part_weights[i-1];
3201 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3203 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3204 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3205 mj_part_t current_work_part,
3206 mj_part_t current_concurrent_num_parts){
3208 #ifdef HAVE_ZOLTAN2_OMP 3215 size_t tlr_array_shift = 0;
3216 mj_part_t cut_shift = 0;
3219 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3221 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3222 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3223 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3226 for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3227 mj_part_t next = tlr_array_shift + ii;
3228 mj_part_t cut_index = cut_shift + ii;
3229 if(this->is_cut_line_determined[cut_index])
continue;
3230 mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3231 right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3234 for (
int j = 1; j < this->num_threads; ++j){
3235 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3236 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3238 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3239 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3243 this->total_part_weight_left_right_closests[num_total_part_in_part +
3244 next] = left_closest_in_process;
3245 this->total_part_weight_left_right_closests[num_total_part_in_part +
3246 num_cuts_in_part + next] = right_closest_in_process;
3249 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3250 cut_shift += num_cuts_in_part;
3253 tlr_array_shift = 0;
3255 size_t total_part_array_shift = 0;
3258 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3260 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3261 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3262 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3264 for(
size_t j = 0; j < num_total_part_in_part; ++j){
3266 mj_part_t cut_ind = j / 2 + cut_shift;
3271 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind])
continue;
3273 for (
int k = 0; k < this->num_threads; ++k){
3274 pwj += this->thread_part_weights[k][total_part_array_shift + j];
3277 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3279 cut_shift += num_cuts_in_part;
3280 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3281 total_part_array_shift += num_total_part_in_part;
3299 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3301 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3302 mj_scalar_t cut_upper_bound,
3303 mj_scalar_t cut_lower_bound,
3304 mj_scalar_t cut_upper_weight,
3305 mj_scalar_t cut_lower_weight,
3306 mj_scalar_t expected_weight,
3307 mj_scalar_t &new_cut_position){
3309 if(
ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3310 new_cut_position = cut_upper_bound;
3314 if(
ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3315 new_cut_position = cut_lower_bound;
3318 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3319 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3320 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3322 mj_scalar_t required_shift = (my_weight_diff / weight_range);
3323 int scale_constant = 20;
3324 int shiftint= int (required_shift * scale_constant);
3325 if (shiftint == 0) shiftint = 1;
3326 required_shift = mj_scalar_t (shiftint) / scale_constant;
3327 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3341 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3343 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3344 mj_part_t num_parts,
3345 mj_scalar_t *mj_current_dim_coords,
3346 mj_scalar_t *current_concurrent_cut_coordinate,
3347 mj_lno_t coordinate_begin,
3348 mj_lno_t coordinate_end,
3349 mj_scalar_t *used_local_cut_line_weight_to_left,
3350 double **used_thread_part_weight_work,
3351 mj_lno_t *out_part_xadj){
3353 mj_part_t num_cuts = num_parts - 1;
3355 #ifdef HAVE_ZOLTAN2_OMP 3356 #pragma omp parallel 3360 #ifdef HAVE_ZOLTAN2_OMP 3361 me = omp_get_thread_num();
3364 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3365 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3369 if (this->distribute_points_on_cut_lines){
3370 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3372 #ifdef HAVE_ZOLTAN2_OMP 3375 for (mj_part_t i = 0; i < num_cuts; ++i){
3377 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3378 for(
int ii = 0; ii < this->num_threads; ++ii){
3379 if(left_weight > this->sEpsilon){
3381 mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3382 if(thread_ii_weight_on_cut < left_weight){
3384 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3388 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3390 left_weight -= thread_ii_weight_on_cut;
3393 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3401 for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3402 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3403 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3411 for(mj_part_t ii = 0; ii < num_parts; ++ii){
3412 thread_num_points_in_parts[ii] = 0;
3416 #ifdef HAVE_ZOLTAN2_OMP 3420 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3422 mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3423 mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3424 mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3425 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3426 if(coordinate_assigned_place % 2 == 1){
3428 if(this->distribute_points_on_cut_lines
3429 && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3433 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3438 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3439 && coordinate_assigned_part < num_cuts - 1
3440 &&
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3441 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3442 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3444 ++thread_num_points_in_parts[coordinate_assigned_part];
3445 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3449 ++coordinate_assigned_part;
3451 while(this->distribute_points_on_cut_lines &&
3452 coordinate_assigned_part < num_cuts){
3454 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3455 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3458 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3460 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3461 ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3462 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3465 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3466 coordinate_assigned_part < num_cuts - 1 &&
3467 ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3468 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3469 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3477 ++coordinate_assigned_part;
3479 ++thread_num_points_in_parts[coordinate_assigned_part];
3480 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3485 ++thread_num_points_in_parts[coordinate_assigned_part];
3486 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3495 #ifdef HAVE_ZOLTAN2_OMP 3498 for(mj_part_t j = 0; j < num_parts; ++j){
3499 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3500 for (
int i = 0; i < this->num_threads; ++i){
3501 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3503 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3504 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3507 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3511 #ifdef HAVE_ZOLTAN2_OMP 3516 for(mj_part_t j = 1; j < num_parts; ++j){
3517 out_part_xadj[j] += out_part_xadj[j - 1];
3523 for(mj_part_t j = 1; j < num_parts; ++j){
3524 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3530 #ifdef HAVE_ZOLTAN2_OMP 3533 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3534 mj_lno_t i = this->coordinate_permutations[ii];
3535 mj_part_t p = this->assigned_part_ids[i];
3536 this->new_coordinate_permutations[coordinate_begin +
3537 thread_num_points_in_parts[p]++] = i;
3572 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3574 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3575 const size_t &num_total_part,
3576 const mj_part_t &num_cuts,
3577 const mj_scalar_t &max_coordinate,
3578 const mj_scalar_t &min_coordinate,
3579 const mj_scalar_t &global_total_weight,
3580 const mj_scalar_t &used_imbalance_tolerance,
3581 mj_scalar_t * current_global_part_weights,
3582 const mj_scalar_t * current_local_part_weights,
3583 const mj_scalar_t *current_part_target_weights,
3584 bool *current_cut_line_determined,
3585 mj_scalar_t *current_cut_coordinates,
3586 mj_scalar_t *current_cut_upper_bounds,
3587 mj_scalar_t *current_cut_lower_bounds,
3588 mj_scalar_t *current_global_left_closest_points,
3589 mj_scalar_t *current_global_right_closest_points,
3590 mj_scalar_t * current_cut_lower_bound_weights,
3591 mj_scalar_t * current_cut_upper_weights,
3592 mj_scalar_t *new_current_cut_coordinates,
3593 mj_scalar_t *current_part_cut_line_weight_to_put_left,
3594 mj_part_t *rectilinear_cut_count,
3595 mj_part_t &my_num_incomplete_cut){
3598 mj_scalar_t seen_weight_in_part = 0;
3600 mj_scalar_t expected_weight_in_part = 0;
3602 mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3605 #ifdef HAVE_ZOLTAN2_OMP 3608 for (mj_part_t i = 0; i < num_cuts; i++){
3611 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3612 current_global_left_closest_points[i] = current_cut_coordinates[i];
3613 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3614 current_global_right_closest_points[i] = current_cut_coordinates[i];
3617 #ifdef HAVE_ZOLTAN2_OMP 3620 for (mj_part_t i = 0; i < num_cuts; i++){
3622 if(this->distribute_points_on_cut_lines){
3624 this->global_rectilinear_cut_weight[i] = 0;
3625 this->process_rectilinear_cut_weight[i] = 0;
3629 if(current_cut_line_determined[i]) {
3630 new_current_cut_coordinates[i] = current_cut_coordinates[i];
3635 seen_weight_in_part = current_global_part_weights[i * 2];
3644 expected_weight_in_part = current_part_target_weights[i];
3646 imbalance_on_left =
imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3648 imbalance_on_right =
imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3650 bool is_left_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3651 bool is_right_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3654 if(is_left_imbalance_valid && is_right_imbalance_valid){
3655 current_cut_line_determined[i] =
true;
3656 #ifdef HAVE_ZOLTAN2_OMP 3659 my_num_incomplete_cut -= 1;
3660 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3663 else if(imbalance_on_left < 0){
3666 if(this->distribute_points_on_cut_lines){
3671 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3673 current_cut_line_determined[i] =
true;
3674 #ifdef HAVE_ZOLTAN2_OMP 3677 my_num_incomplete_cut -= 1;
3680 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3684 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3687 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3691 current_cut_line_determined[i] =
true;
3692 #ifdef HAVE_ZOLTAN2_OMP 3695 *rectilinear_cut_count += 1;
3698 #ifdef HAVE_ZOLTAN2_OMP 3701 my_num_incomplete_cut -= 1;
3702 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3703 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3704 current_local_part_weights[i * 2];
3709 current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3711 current_cut_lower_bound_weights[i] = seen_weight_in_part;
3715 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3716 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3717 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3719 if(p_weight >= expected_weight_in_part){
3724 if(p_weight == expected_weight_in_part){
3725 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3726 current_cut_upper_weights[i] = p_weight;
3727 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3728 current_cut_lower_bound_weights[i] = p_weight;
3729 }
else if (p_weight < current_cut_upper_weights[i]){
3732 current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3733 current_cut_upper_weights[i] = p_weight;
3739 if(line_weight >= expected_weight_in_part){
3742 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3743 current_cut_upper_weights[i] = line_weight;
3744 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3745 current_cut_lower_bound_weights[i] = p_weight;
3750 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3751 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3752 current_cut_lower_bound_weights[i] = p_weight;
3757 mj_scalar_t new_cut_position = 0;
3758 this->mj_calculate_new_cut_position(
3759 current_cut_upper_bounds[i],
3760 current_cut_lower_bounds[i],
3761 current_cut_upper_weights[i],
3762 current_cut_lower_bound_weights[i],
3763 expected_weight_in_part, new_cut_position);
3767 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3770 current_cut_line_determined[i] =
true;
3771 #ifdef HAVE_ZOLTAN2_OMP 3774 my_num_incomplete_cut -= 1;
3777 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3779 new_current_cut_coordinates [i] = new_cut_position;
3785 current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3786 current_cut_upper_weights[i] = seen_weight_in_part;
3789 for (
int ii = i - 1; ii >= 0; --ii){
3790 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3791 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3792 if(p_weight <= expected_weight_in_part){
3793 if(p_weight == expected_weight_in_part){
3796 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3797 current_cut_upper_weights[i] = p_weight;
3798 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3799 current_cut_lower_bound_weights[i] = p_weight;
3801 else if (p_weight > current_cut_lower_bound_weights[i]){
3804 current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3805 current_cut_lower_bound_weights[i] = p_weight;
3811 if(line_weight > expected_weight_in_part){
3812 current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3813 current_cut_upper_weights[i] = line_weight;
3822 if (p_weight >= expected_weight_in_part &&
3823 (p_weight < current_cut_upper_weights[i] ||
3824 (p_weight == current_cut_upper_weights[i] &&
3825 current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3829 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3830 current_cut_upper_weights[i] = p_weight;
3833 mj_scalar_t new_cut_position = 0;
3834 this->mj_calculate_new_cut_position(
3835 current_cut_upper_bounds[i],
3836 current_cut_lower_bounds[i],
3837 current_cut_upper_weights[i],
3838 current_cut_lower_bound_weights[i],
3839 expected_weight_in_part,
3843 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3845 current_cut_line_determined[i] =
true;
3846 #ifdef HAVE_ZOLTAN2_OMP 3849 my_num_incomplete_cut -= 1;
3851 new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3853 new_current_cut_coordinates [ i] = new_cut_position;
3862 #ifdef HAVE_ZOLTAN2_OMP 3867 if(*rectilinear_cut_count > 0){
3870 Teuchos::scan<int,mj_scalar_t>(
3871 *comm, Teuchos::REDUCE_SUM,
3873 this->process_rectilinear_cut_weight,
3874 this->global_rectilinear_cut_weight
3879 for (mj_part_t i = 0; i < num_cuts; ++i){
3881 if(this->global_rectilinear_cut_weight[i] > 0) {
3883 mj_scalar_t expected_part_weight = current_part_target_weights[i];
3885 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
3887 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
3889 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
3892 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
3894 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
3904 if(space_left_to_me < 0){
3906 current_part_cut_line_weight_to_put_left[i] = 0;
3908 else if(space_left_to_me >= my_weight_on_line){
3911 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
3916 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
3923 *rectilinear_cut_count = 0;
3938 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3940 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
3941 mj_part_t num_procs,
3942 mj_part_t num_parts,
3943 mj_gno_t *&num_points_in_all_processor_parts){
3946 size_t allocation_size = num_parts * (num_procs + 1);
3953 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
3958 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
3961 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
3964 memset(num_local_points_in_each_part_to_reduce_sum, 0,
sizeof(mj_gno_t)*allocation_size);
3967 for (mj_part_t i = 0; i < num_parts; ++i){
3968 mj_lno_t part_begin_index = 0;
3970 part_begin_index = this->new_part_xadj[i - 1];
3972 mj_lno_t part_end_index = this->new_part_xadj[i];
3973 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
3978 memcpy (my_local_point_counts_in_each_art,
3979 my_local_points_to_reduce_sum,
3980 sizeof(mj_gno_t) * (num_parts) );
3988 reduceAll<int, mj_gno_t>(
3990 Teuchos::REDUCE_SUM,
3992 num_local_points_in_each_part_to_reduce_sum,
3993 num_points_in_all_processor_parts);
3996 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4013 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4015 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4016 size_t migration_reduce_all_population,
4017 mj_lno_t num_coords_for_last_dim_part,
4018 mj_part_t num_procs,
4019 mj_part_t num_parts,
4020 mj_gno_t *num_points_in_all_processor_parts){
4028 if (this->check_migrate_avoid_migration_option == 0){
4029 double global_imbalance = 0;
4031 size_t global_shift = num_procs * num_parts;
4033 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4034 for (mj_part_t i = 0; i < num_parts; ++i){
4035 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4036 / double(num_procs);
4039 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4042 global_imbalance /= num_parts;
4043 global_imbalance /= num_procs;
4051 if(global_imbalance <= this->minimum_migration_imbalance){
4074 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4076 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4077 mj_part_t num_parts,
4078 mj_part_t *part_assignment_proc_begin_indices,
4079 mj_part_t *processor_chains_in_parts,
4080 mj_lno_t *send_count_to_each_proc,
4081 int *coordinate_destinations){
4083 for (mj_part_t p = 0; p < num_parts; ++p){
4084 mj_lno_t part_begin = 0;
4085 if (p > 0) part_begin = this->new_part_xadj[p - 1];
4086 mj_lno_t part_end = this->new_part_xadj[p];
4089 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4091 mj_lno_t num_total_send = 0;
4092 for (mj_lno_t j=part_begin; j < part_end; j++){
4093 mj_lno_t local_ind = this->new_coordinate_permutations[j];
4094 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4098 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4100 processor_chains_in_parts[proc_to_sent] = -1;
4102 proc_to_sent = part_assignment_proc_begin_indices[p];
4105 coordinate_destinations[local_ind] = proc_to_sent;
4125 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4127 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4128 mj_gno_t * num_points_in_all_processor_parts,
4129 mj_part_t num_parts,
4130 mj_part_t num_procs,
4131 mj_lno_t *send_count_to_each_proc,
4132 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4133 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4134 mj_part_t &out_part_index,
4135 mj_part_t &output_part_numbering_begin_index,
4136 int *coordinate_destinations){
4139 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4140 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4143 bool did_i_find_my_group =
false;
4145 mj_part_t num_free_procs = num_procs;
4146 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4148 double max_imbalance_difference = 0;
4149 mj_part_t max_differing_part = 0;
4152 for (mj_part_t i=0; i < num_parts; i++){
4155 double scalar_required_proc = num_procs *
4156 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4159 mj_part_t required_proc =
static_cast<mj_part_t
> (0.5 + scalar_required_proc);
4163 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4164 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4168 num_free_procs -= required_proc;
4170 --minimum_num_procs_required_for_rest_of_parts;
4173 num_procs_assigned_to_each_part[i] = required_proc;
4178 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4179 if (imbalance_wrt_ideal > max_imbalance_difference){
4180 max_imbalance_difference = imbalance_wrt_ideal;
4181 max_differing_part = i;
4186 if (num_free_procs > 0){
4187 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4194 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4196 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4197 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4206 for (
int i = 0; i < num_procs; ++i ){
4207 processor_part_assignments[i] = -1;
4208 processor_chains_in_parts[i] = -1;
4210 for (
int i = 0; i < num_parts; ++i ){
4211 part_assignment_proc_begin_indices[i] = -1;
4216 uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4217 for(mj_part_t i = 0; i < num_parts; ++i){
4221 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4222 sort_item_num_part_points_in_procs[ii].id = ii;
4225 if (processor_part_assignments[ii] == -1){
4226 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4227 sort_item_num_part_points_in_procs[ii].signbit = 1;
4237 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4238 sort_item_num_part_points_in_procs[ii].signbit = 0;
4242 uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4252 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4253 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4254 mj_gno_t ideal_num_points_in_a_proc =
4255 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part /
double (required_proc_count)));
4258 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4259 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4260 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4264 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4265 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4267 processor_part_assignments[proc_id] = i;
4270 bool did_change_sign =
false;
4272 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4275 if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4276 did_change_sign =
true;
4277 sort_item_num_part_points_in_procs[ii].signbit = 1;
4283 if(did_change_sign){
4285 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4297 if (!did_i_find_my_group){
4298 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4300 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4302 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4304 if(proc_id_to_assign == this->myRank){
4306 did_i_find_my_group =
true;
4308 part_assignment_proc_begin_indices[i] = this->myRank;
4309 processor_chains_in_parts[this->myRank] = -1;
4312 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4315 for (mj_part_t in = 0; in < i; ++in){
4316 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4323 if (!did_i_find_my_group){
4324 processor_ranks_for_subcomm.clear();
4332 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4333 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4334 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4339 if (num_points_to_sent < 0) {
4340 cout <<
"Migration - processor assignments - for part:" << i <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" << num_points_to_sent << std::endl;
4346 while (num_points_to_sent > 0){
4348 if (num_points_to_sent <= space_left_in_sent_proc){
4350 space_left_in_sent_proc -= num_points_to_sent;
4352 if (this->myRank == nonassigned_proc_id){
4354 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4357 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4358 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4359 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4361 num_points_to_sent = 0;
4365 if(space_left_in_sent_proc > 0){
4366 num_points_to_sent -= space_left_in_sent_proc;
4369 if (this->myRank == nonassigned_proc_id){
4371 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4372 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4373 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4374 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4379 ++next_proc_to_send_index;
4382 if(next_part_to_send_index < nprocs - required_proc_count ){
4383 cout <<
"Migration - processor assignments - for part:" 4385 <<
" next_part_to_send :" << next_part_to_send_index
4386 <<
" nprocs:" << nprocs
4387 <<
" required_proc_count:" << required_proc_count
4388 <<
" Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4394 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4396 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4404 this->assign_send_destinations(
4406 part_assignment_proc_begin_indices,
4407 processor_chains_in_parts,
4408 send_count_to_each_proc,
4409 coordinate_destinations);
4411 freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4412 freeArray<mj_part_t>(processor_chains_in_parts);
4413 freeArray<mj_part_t>(processor_part_assignments);
4414 freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4415 freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4432 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4434 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4435 mj_part_t num_parts,
4436 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
4437 int *coordinate_destinations,
4438 mj_part_t &output_part_numbering_begin_index,
4439 std::vector<mj_part_t> *next_future_num_parts_in_parts){
4441 mj_part_t part_shift_amount = output_part_numbering_begin_index;
4442 mj_part_t previous_processor = -1;
4443 for(mj_part_t i = 0; i < num_parts; ++i){
4444 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4446 mj_lno_t part_begin_index = 0;
4447 if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4448 mj_lno_t part_end_index = this->new_part_xadj[p];
4450 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4451 if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4452 output_part_numbering_begin_index = part_shift_amount;
4454 previous_processor = assigned_proc;
4455 part_shift_amount += (*next_future_num_parts_in_parts)[p];
4457 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4458 mj_lno_t localInd = this->new_coordinate_permutations[j];
4459 coordinate_destinations[localInd] = assigned_proc;
4481 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4483 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4484 mj_gno_t * num_points_in_all_processor_parts,
4485 mj_part_t num_parts,
4486 mj_part_t num_procs,
4487 mj_lno_t *send_count_to_each_proc,
4488 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4489 mj_part_t &out_num_part,
4490 std::vector<mj_part_t> &out_part_indices,
4491 mj_part_t &output_part_numbering_begin_index,
4492 int *coordinate_destinations){
4495 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4496 out_part_indices.clear();
4500 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4501 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4505 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
4507 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4509 for (mj_part_t i = 0; i < num_procs; ++i){
4510 space_in_each_processor[i] = work_each;
4517 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4518 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
4519 int empty_proc_count = num_procs;
4523 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4527 for (mj_part_t i = 0; i < num_parts; ++i){
4528 sort_item_point_counts_in_parts[i].id = i;
4529 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4532 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4538 for (mj_part_t j = 0; j < num_parts; ++j){
4540 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4542 mj_gno_t load = global_num_points_in_parts[i];
4545 mj_part_t assigned_proc = -1;
4547 mj_part_t best_proc_to_assign = 0;
4551 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4552 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4557 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4559 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4562 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4565 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4568 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4569 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4570 mj_lno_t left_space = space_in_each_processor[ii] - load;
4572 if(left_space >= 0 ){
4577 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4578 best_proc_to_assign = ii;
4583 if (assigned_proc == -1){
4584 assigned_proc = best_proc_to_assign;
4587 if (num_parts_proc_assigned[assigned_proc]++ == 0){
4590 space_in_each_processor[assigned_proc] -= load;
4592 sort_item_part_to_proc_assignment[j].id = i;
4593 sort_item_part_to_proc_assignment[j].val = assigned_proc;
4597 if (assigned_proc == this->myRank){
4599 out_part_indices.push_back(i);
4603 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4605 freeArray<mj_part_t>(num_parts_proc_assigned);
4606 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4607 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4608 freeArray<mj_lno_t >(space_in_each_processor);
4612 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4616 this->assign_send_destinations2(
4618 sort_item_part_to_proc_assignment,
4619 coordinate_destinations,
4620 output_part_numbering_begin_index,
4621 next_future_num_parts_in_parts);
4623 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4644 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4646 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
4647 mj_gno_t * num_points_in_all_processor_parts,
4648 mj_part_t num_parts,
4649 mj_part_t num_procs,
4650 mj_lno_t *send_count_to_each_proc,
4651 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4652 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4653 mj_part_t &out_num_part,
4654 std::vector<mj_part_t> &out_part_indices,
4655 mj_part_t &output_part_numbering_begin_index,
4656 int *coordinate_destinations){
4660 processor_ranks_for_subcomm.clear();
4662 if (num_procs > num_parts){
4667 mj_part_t out_part_index = 0;
4668 this->mj_assign_proc_to_parts(
4669 num_points_in_all_processor_parts,
4672 send_count_to_each_proc,
4673 processor_ranks_for_subcomm,
4674 next_future_num_parts_in_parts,
4676 output_part_numbering_begin_index,
4677 coordinate_destinations
4681 out_part_indices.clear();
4682 out_part_indices.push_back(out_part_index);
4689 processor_ranks_for_subcomm.push_back(this->myRank);
4693 this->mj_assign_parts_to_procs(
4694 num_points_in_all_processor_parts,
4697 send_count_to_each_proc,
4698 next_future_num_parts_in_parts,
4701 output_part_numbering_begin_index,
4702 coordinate_destinations);
4718 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4720 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
4721 mj_part_t num_procs,
4722 mj_lno_t &num_new_local_points,
4723 std::string iteration,
4724 int *coordinate_destinations,
4725 mj_part_t num_parts)
4727 #ifdef ENABLE_ZOLTAN_MIGRATION 4728 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
4734 ZOLTAN_COMM_OBJ *plan = NULL;
4735 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4736 int num_incoming_gnos = 0;
4737 int message_tag = 7859;
4739 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4740 int ierr = Zoltan_Comm_Create(
4742 int(this->num_local_coords),
4743 coordinate_destinations,
4746 &num_incoming_gnos);
4748 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4750 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4751 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4755 ierr = Zoltan_Comm_Do(
4758 (
char *) this->current_mj_gnos,
4760 (
char *) incoming_gnos);
4763 freeArray<mj_gno_t>(this->current_mj_gnos);
4764 this->current_mj_gnos = incoming_gnos;
4768 for (
int i = 0; i < this->coord_dim; ++i){
4770 mj_scalar_t *coord = this->mj_coordinates[i];
4772 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4773 ierr = Zoltan_Comm_Do(
4777 sizeof(mj_scalar_t),
4778 (
char *) this->mj_coordinates[i]);
4780 freeArray<mj_scalar_t>(coord);
4784 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4786 mj_scalar_t *weight = this->mj_weights[i];
4788 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4789 ierr = Zoltan_Comm_Do(
4793 sizeof(mj_scalar_t),
4794 (
char *) this->mj_weights[i]);
4796 freeArray<mj_scalar_t>(weight);
4801 int *coord_own = allocMemory<int>(num_incoming_gnos);
4803 ierr = Zoltan_Comm_Do(
4806 (
char *) this->owner_of_coordinate,
4807 sizeof(
int), (
char *) coord_own);
4809 freeArray<int>(this->owner_of_coordinate);
4810 this->owner_of_coordinate = coord_own;
4816 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4817 if(num_procs < num_parts){
4819 ierr = Zoltan_Comm_Do(
4822 (
char *) this->assigned_part_ids,
4824 (
char *) new_parts);
4827 freeArray<mj_part_t>(this->assigned_part_ids);
4828 this->assigned_part_ids = new_parts;
4830 ierr = Zoltan_Comm_Destroy(&plan);
4832 num_new_local_points = num_incoming_gnos;
4833 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4838 #endif // ENABLE_ZOLTAN_MIGRATION 4840 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
4841 Tpetra::Distributor distributor(this->comm);
4842 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
4843 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
4844 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
4846 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
4849 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
4850 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
4851 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
4852 freeArray<mj_gno_t>(this->current_mj_gnos);
4853 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
4855 this->current_mj_gnos,
4856 received_gnos.getRawPtr(),
4857 num_incoming_gnos *
sizeof(mj_gno_t));
4860 for (
int i = 0; i < this->coord_dim; ++i){
4862 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
4863 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
4864 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
4865 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
4866 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4868 this->mj_coordinates[i],
4869 received_coord.getRawPtr(),
4870 num_incoming_gnos *
sizeof(mj_scalar_t));
4874 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4876 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
4877 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
4878 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
4879 freeArray<mj_scalar_t>(this->mj_weights[i]);
4880 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4882 this->mj_weights[i],
4883 received_weight.getRawPtr(),
4884 num_incoming_gnos *
sizeof(mj_scalar_t));
4889 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
4890 ArrayRCP<int> received_owners(num_incoming_gnos);
4891 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
4892 freeArray<int>(this->owner_of_coordinate);
4893 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
4895 this->owner_of_coordinate,
4896 received_owners.getRawPtr(),
4897 num_incoming_gnos *
sizeof(int));
4903 if(num_procs < num_parts){
4904 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
4905 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
4906 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
4907 freeArray<mj_part_t>(this->assigned_part_ids);
4908 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
4910 this->assigned_part_ids,
4911 received_partids.getRawPtr(),
4912 num_incoming_gnos *
sizeof(mj_part_t));
4915 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
4916 freeArray<mj_part_t>(this->assigned_part_ids);
4917 this->assigned_part_ids = new_parts;
4919 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
4920 num_new_local_points = num_incoming_gnos;
4931 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4933 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
4934 mj_part_t group_size = processor_ranks_for_subcomm.size();
4935 mj_part_t *ids = allocMemory<mj_part_t>(group_size);
4936 for(mj_part_t i = 0; i < group_size; ++i) {
4937 ids[i] = processor_ranks_for_subcomm[i];
4939 ArrayView<const mj_part_t> idView(ids, group_size);
4940 this->comm = this->comm->createSubcommunicator(idView);
4941 freeArray<mj_part_t>(ids);
4950 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4952 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
4953 mj_part_t output_num_parts,
4954 mj_part_t num_parts){
4956 if (output_num_parts == 1){
4957 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4958 this->new_coordinate_permutations[i] = i;
4960 this->new_part_xadj[0] = this->num_local_coords;
4967 mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
4969 mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
4971 memset(num_points_in_parts, 0,
sizeof(mj_lno_t) * num_parts);
4973 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4974 mj_part_t ii = this->assigned_part_ids[i];
4975 ++num_points_in_parts[ii];
4980 mj_lno_t prev_index = 0;
4981 for(mj_part_t i = 0; i < num_parts; ++i){
4982 if(num_points_in_parts[i] > 0) {
4983 this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
4984 prev_index += num_points_in_parts[i];
4985 part_shifts[i] = p++;
4990 mj_part_t assigned_num_parts = p - 1;
4991 for (;p < num_parts; ++p){
4992 this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
4994 for(mj_part_t i = 0; i < output_num_parts; ++i){
4995 num_points_in_parts[i] = this->new_part_xadj[i];
5001 for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5002 mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5003 this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5006 freeArray<mj_lno_t>(num_points_in_parts);
5007 freeArray<mj_part_t>(part_shifts);
5034 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5036 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5037 mj_part_t input_num_parts,
5038 mj_part_t &output_num_parts,
5039 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5040 mj_part_t &output_part_begin_index,
5041 size_t migration_reduce_all_population,
5042 mj_lno_t num_coords_for_last_dim_part,
5043 std::string iteration,
5044 RCP<mj_partBoxVector_t> &input_part_boxes,
5045 RCP<mj_partBoxVector_t> &output_part_boxes
5048 mj_part_t num_procs = this->comm->getSize();
5049 this->myRank = this->comm->getRank();
5055 mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5058 this->get_processor_num_points_in_parts(
5061 num_points_in_all_processor_parts);
5065 if (!this->mj_check_to_migrate(
5066 migration_reduce_all_population,
5067 num_coords_for_last_dim_part,
5070 num_points_in_all_processor_parts)){
5071 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5076 mj_lno_t *send_count_to_each_proc = NULL;
5077 int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5078 send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5079 for (
int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5081 std::vector<mj_part_t> processor_ranks_for_subcomm;
5082 std::vector<mj_part_t> out_part_indices;
5085 this->mj_migration_part_proc_assignment(
5086 num_points_in_all_processor_parts,
5089 send_count_to_each_proc,
5090 processor_ranks_for_subcomm,
5091 next_future_num_parts_in_parts,
5094 output_part_begin_index,
5095 coordinate_destinations);
5100 freeArray<mj_lno_t>(send_count_to_each_proc);
5101 std::vector <mj_part_t> tmpv;
5103 std::sort (out_part_indices.begin(), out_part_indices.end());
5104 mj_part_t outP = out_part_indices.size();
5106 mj_gno_t new_global_num_points = 0;
5107 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5109 if (this->mj_keep_part_boxes){
5110 input_part_boxes->clear();
5115 for (mj_part_t i = 0; i < outP; ++i){
5116 mj_part_t ind = out_part_indices[i];
5117 new_global_num_points += global_num_points_in_parts[ind];
5118 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5119 if (this->mj_keep_part_boxes){
5120 input_part_boxes->push_back((*output_part_boxes)[ind]);
5124 if (this->mj_keep_part_boxes){
5125 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5126 input_part_boxes = output_part_boxes;
5127 output_part_boxes = tmpPartBoxes;
5129 next_future_num_parts_in_parts->clear();
5130 for (mj_part_t i = 0; i < outP; ++i){
5131 mj_part_t p = tmpv[i];
5132 next_future_num_parts_in_parts->push_back(p);
5135 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5137 mj_lno_t num_new_local_points = 0;
5141 this->mj_migrate_coords(
5143 num_new_local_points,
5145 coordinate_destinations,
5149 freeArray<int>(coordinate_destinations);
5151 if(this->num_local_coords != num_new_local_points){
5152 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5153 freeArray<mj_lno_t>(this->coordinate_permutations);
5155 this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5156 this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5158 this->num_local_coords = num_new_local_points;
5159 this->num_global_coords = new_global_num_points;
5164 this->create_sub_communicator(processor_ranks_for_subcomm);
5165 processor_ranks_for_subcomm.clear();
5168 this->fill_permutation_array(
5188 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5190 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5191 mj_part_t num_parts,
5192 mj_scalar_t *mj_current_dim_coords,
5193 mj_scalar_t *current_concurrent_cut_coordinate,
5194 mj_lno_t coordinate_begin,
5195 mj_lno_t coordinate_end,
5196 mj_scalar_t *used_local_cut_line_weight_to_left,
5197 mj_lno_t *out_part_xadj,
5201 mj_part_t no_cuts = num_parts - 1;
5206 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5207 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5212 if (this->distribute_points_on_cut_lines){
5214 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5215 for (mj_part_t i = 0; i < no_cuts; ++i){
5217 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5219 for(
int ii = 0; ii < this->num_threads; ++ii){
5220 if(left_weight > this->sEpsilon){
5222 mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5223 if(thread_ii_weight_on_cut < left_weight){
5224 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5227 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5229 left_weight -= thread_ii_weight_on_cut;
5232 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5240 for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5241 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5242 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5250 for(mj_part_t ii = 0; ii < num_parts; ++ii){
5251 thread_num_points_in_parts[ii] = 0;
5263 mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5266 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5267 typedef std::vector< multiSItem > multiSVector;
5268 typedef std::vector<multiSVector> multiS2Vector;
5271 std::vector<mj_scalar_t *>allocated_memory;
5274 multiS2Vector sort_vector_points_on_cut;
5277 mj_part_t different_cut_count = 1;
5282 multiSVector tmpMultiSVector;
5283 sort_vector_points_on_cut.push_back(tmpMultiSVector);
5285 for (mj_part_t i = 1; i < no_cuts ; ++i){
5288 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5289 cut_map[i] = cut_map[i-1];
5292 cut_map[i] = different_cut_count++;
5293 multiSVector tmp2MultiSVector;
5294 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5300 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5302 mj_lno_t i = this->coordinate_permutations[ii];
5304 mj_part_t pp = this->assigned_part_ids[i];
5305 mj_part_t p = pp / 2;
5308 mj_scalar_t *
vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5309 allocated_memory.push_back(
vals);
5313 for(
int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5314 vals[val_ind++] = this->mj_coordinates[dim][i];
5316 for(
int dim = 0; dim < coordInd; ++dim){
5317 vals[val_ind++] = this->mj_coordinates[dim][i];
5319 multiSItem tempSortItem(i, this->coord_dim -1,
vals);
5321 mj_part_t cmap = cut_map[p];
5322 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5326 ++thread_num_points_in_parts[p];
5327 this->assigned_part_ids[i] = p;
5332 for (mj_part_t i = 0; i < different_cut_count; ++i){
5333 std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5337 mj_part_t previous_cut_map = cut_map[0];
5346 mj_scalar_t weight_stolen_from_previous_part = 0;
5347 for (mj_part_t p = 0; p < no_cuts; ++p){
5349 mj_part_t mapped_cut = cut_map[p];
5353 if (previous_cut_map != mapped_cut){
5354 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5355 for (; sort_vector_end >= 0; --sort_vector_end){
5356 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5357 mj_lno_t i = t.index;
5358 ++thread_num_points_in_parts[p];
5359 this->assigned_part_ids[i] = p;
5361 sort_vector_points_on_cut[previous_cut_map].clear();
5363 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5365 for (; sort_vector_end >= 0; --sort_vector_end){
5366 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5367 mj_lno_t i = t.index;
5368 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5372 if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5373 my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part -
ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5376 my_local_thread_cut_weights_to_put_left[p] -= w;
5377 sort_vector_points_on_cut[mapped_cut].pop_back();
5378 ++thread_num_points_in_parts[p];
5379 this->assigned_part_ids[i] = p;
5382 if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5383 if(mapped_cut == cut_map[p + 1] ){
5386 if (previous_cut_map != mapped_cut){
5387 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5392 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5396 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5405 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5406 if (previous_cut_map != mapped_cut){
5407 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5410 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5414 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5420 previous_cut_map = mapped_cut;
5423 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5424 for (; sort_vector_end >= 0; --sort_vector_end){
5425 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5426 mj_lno_t i = t.index;
5427 ++thread_num_points_in_parts[no_cuts];
5428 this->assigned_part_ids[i] = no_cuts;
5430 sort_vector_points_on_cut[previous_cut_map].clear();
5431 freeArray<mj_part_t> (cut_map);
5434 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5435 for(mj_lno_t i = 0; i < vSize; ++i){
5436 freeArray<mj_scalar_t> (allocated_memory[i]);
5440 for(mj_part_t j = 0; j < num_parts; ++j){
5441 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5442 for (
int i = 0; i < this->num_threads; ++i){
5443 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5444 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5445 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5448 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
5452 for(mj_part_t j = 1; j < num_parts; ++j){
5453 out_part_xadj[j] += out_part_xadj[j - 1];
5459 for(mj_part_t j = 1; j < num_parts; ++j){
5460 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5465 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5466 mj_lno_t i = this->coordinate_permutations[ii];
5467 mj_part_t p = this->assigned_part_ids[i];
5468 this->new_coordinate_permutations[coordinate_begin +
5469 thread_num_points_in_parts[p]++] = i;
5484 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5486 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5487 mj_part_t current_num_parts,
5488 mj_part_t output_part_begin_index,
5489 RCP<mj_partBoxVector_t> &output_part_boxes,
5490 bool is_data_ever_migrated)
5492 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5494 #ifdef HAVE_ZOLTAN2_OMP 5495 #pragma omp parallel for 5497 for(mj_part_t i = 0; i < current_num_parts;++i){
5500 mj_lno_t end = this->part_xadj[i];
5502 if(i > 0) begin = this->part_xadj[i -1];
5503 mj_part_t part_to_set_index = i + output_part_begin_index;
5504 if (this->mj_keep_part_boxes){
5505 (*output_part_boxes)[i].setpId(part_to_set_index);
5507 for (mj_lno_t ii = begin; ii < end; ++ii){
5508 mj_lno_t k = this->coordinate_permutations[ii];
5509 this->assigned_part_ids[k] = part_to_set_index;
5514 if(!is_data_ever_migrated){
5521 #ifdef ENABLE_ZOLTAN_MIGRATION 5522 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5529 ZOLTAN_COMM_OBJ *plan = NULL;
5530 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5533 int message_tag = 7856;
5535 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating");
5536 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
5537 this->owner_of_coordinate, mpi_comm, message_tag,
5540 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating" );
5542 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5545 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5546 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->current_mj_gnos,
5547 sizeof(mj_gno_t), (
char *) incoming_gnos);
5550 freeArray<mj_gno_t>(this->current_mj_gnos);
5551 this->current_mj_gnos = incoming_gnos;
5553 mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5556 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->assigned_part_ids,
5557 sizeof(mj_part_t), (
char *) incoming_partIds);
5559 freeArray<mj_part_t>(this->assigned_part_ids);
5560 this->assigned_part_ids = incoming_partIds;
5562 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5563 ierr = Zoltan_Comm_Destroy(&plan);
5566 this->num_local_coords = incoming;
5571 #endif // !ENABLE_ZOLTAN_MIGRATION 5574 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating");
5575 Tpetra::Distributor distributor(this->mj_problemComm);
5576 ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5577 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5578 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating" );
5580 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5582 ArrayRCP<mj_gno_t> received_gnos(incoming);
5583 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5584 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5585 freeArray<mj_gno_t>(this->current_mj_gnos);
5586 this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5587 memcpy( this->current_mj_gnos,
5588 received_gnos.getRawPtr(),
5589 incoming *
sizeof(mj_gno_t));
5592 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5593 ArrayRCP<mj_part_t> received_partids(incoming);
5594 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5595 freeArray<mj_part_t>(this->assigned_part_ids);
5596 this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5597 memcpy( this->assigned_part_ids,
5598 received_partids.getRawPtr(),
5599 incoming *
sizeof(mj_part_t));
5601 this->num_local_coords = incoming;
5602 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5607 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5609 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5614 if (this->mj_keep_part_boxes){
5615 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
5619 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5624 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5626 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
5627 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5629 for (
int i=0; i < this->coord_dim; i++){
5630 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5632 freeArray<mj_scalar_t *>(this->mj_coordinates);
5634 for (
int i=0; i < this->num_weights_per_coord; i++){
5635 freeArray<mj_scalar_t>(this->mj_weights[i]);
5637 freeArray<mj_scalar_t *>(this->mj_weights);
5639 freeArray<int>(this->owner_of_coordinate);
5641 for(
int i = 0; i < this->num_threads; ++i){
5642 freeArray<mj_lno_t>(this->thread_point_counts[i]);
5645 freeArray<mj_lno_t *>(this->thread_point_counts);
5646 freeArray<double *> (this->thread_part_weight_work);
5648 if(this->distribute_points_on_cut_lines){
5649 freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5650 for(
int i = 0; i < this->num_threads; ++i){
5651 freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5653 freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5654 freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5655 freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5658 freeArray<mj_part_t>(this->my_incomplete_cut_count);
5660 freeArray<mj_scalar_t>(this->max_min_coords);
5662 freeArray<mj_lno_t>(this->part_xadj);
5664 freeArray<mj_lno_t>(this->coordinate_permutations);
5666 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5668 freeArray<mj_scalar_t>(this->all_cut_coordinates);
5670 freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5672 freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5674 freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5676 freeArray<mj_scalar_t>(this->target_part_weights);
5678 freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5680 freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5682 freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5683 freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5684 freeArray<bool>(this->is_cut_line_determined);
5685 freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5686 freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5688 for(
int i = 0; i < this->num_threads; ++i){
5689 freeArray<double>(this->thread_part_weights[i]);
5690 freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5691 freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5694 freeArray<double *>(this->thread_part_weights);
5695 freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5696 freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5698 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5708 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5711 bool distribute_points_on_cut_lines_,
5712 int max_concurrent_part_calculation_,
5713 int check_migrate_avoid_migration_option_,
5714 mj_scalar_t minimum_migration_imbalance_){
5715 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5716 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5717 this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5718 this->minimum_migration_imbalance = minimum_migration_imbalance_;
5751 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5755 const RCP<const Environment> &env,
5756 RCP<
const Comm<int> > &problemComm,
5758 double imbalance_tolerance_,
5759 size_t num_global_parts_,
5760 mj_part_t *part_no_array_,
5761 int recursion_depth_,
5764 mj_lno_t num_local_coords_,
5765 mj_gno_t num_global_coords_,
5766 const mj_gno_t *initial_mj_gnos_,
5767 mj_scalar_t **mj_coordinates_,
5769 int num_weights_per_coord_,
5770 bool *mj_uniform_weights_,
5771 mj_scalar_t **mj_weights_,
5772 bool *mj_uniform_parts_,
5773 mj_scalar_t **mj_part_sizes_,
5775 mj_part_t *&result_assigned_part_ids_,
5776 mj_gno_t *&result_mj_gnos_
5781 if(comm->getRank() == 0){
5782 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
5783 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
5784 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
5789 this->mj_problemComm = problemComm;
5790 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
5792 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Total");
5793 this->mj_env->debug(3,
"In MultiJagged Jagged");
5796 this->imbalance_tolerance = imbalance_tolerance_;
5797 this->num_global_parts = num_global_parts_;
5798 this->part_no_array = part_no_array_;
5799 this->recursion_depth = recursion_depth_;
5801 this->coord_dim = coord_dim_;
5802 this->num_local_coords = num_local_coords_;
5803 this->num_global_coords = num_global_coords_;
5804 this->mj_coordinates = mj_coordinates_;
5805 this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_;
5807 this->num_weights_per_coord = num_weights_per_coord_;
5808 this->mj_uniform_weights = mj_uniform_weights_;
5809 this->mj_weights = mj_weights_;
5810 this->mj_uniform_parts = mj_uniform_parts_;
5811 this->mj_part_sizes = mj_part_sizes_;
5813 this->num_threads = 1;
5814 #ifdef HAVE_ZOLTAN2_OMP 5815 #pragma omp parallel 5818 this->num_threads = omp_get_num_threads();
5823 this->set_part_specifications();
5825 this->allocate_set_work_memory();
5829 this->comm = this->mj_problemComm->duplicate();
5832 mj_part_t current_num_parts = 1;
5833 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
5835 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
5837 mj_part_t output_part_begin_index = 0;
5838 mj_part_t future_num_parts = this->total_num_part;
5839 bool is_data_ever_migrated =
false;
5841 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
5842 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
5843 next_future_num_parts_in_parts->push_back(this->num_global_parts);
5845 RCP<mj_partBoxVector_t> input_part_boxes(
new mj_partBoxVector_t(),
true) ;
5846 RCP<mj_partBoxVector_t> output_part_boxes(
new mj_partBoxVector_t(),
true);
5848 compute_global_box();
5849 if(this->mj_keep_part_boxes){
5850 this->init_part_boxes(output_part_boxes);
5853 for (
int i = 0; i < this->recursion_depth; ++i){
5856 std::vector <mj_part_t> num_partitioning_in_current_dim;
5870 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
5871 future_num_part_in_parts = next_future_num_parts_in_parts;
5872 next_future_num_parts_in_parts = tmpPartVect;
5877 next_future_num_parts_in_parts->clear();
5879 if(this->mj_keep_part_boxes){
5880 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5881 input_part_boxes = output_part_boxes;
5882 output_part_boxes = tmpPartBoxes;
5883 output_part_boxes->clear();
5887 mj_part_t output_part_count_in_dimension =
5888 this->update_part_num_arrays(
5889 num_partitioning_in_current_dim,
5890 future_num_part_in_parts,
5891 next_future_num_parts_in_parts,
5901 if(output_part_count_in_dimension == current_num_parts) {
5903 tmpPartVect= future_num_part_in_parts;
5904 future_num_part_in_parts = next_future_num_parts_in_parts;
5905 next_future_num_parts_in_parts = tmpPartVect;
5907 if(this->mj_keep_part_boxes){
5908 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5909 input_part_boxes = output_part_boxes;
5910 output_part_boxes = tmpPartBoxes;
5917 int coordInd = i % this->coord_dim;
5918 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
5921 std::string istring = Teuchos::toString<int>(i);
5922 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
5926 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
5929 mj_part_t output_part_index = 0;
5932 mj_part_t output_coordinate_end_index = 0;
5934 mj_part_t current_work_part = 0;
5935 mj_part_t current_concurrent_num_parts =
5936 std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
5938 mj_part_t obtained_part_index = 0;
5941 for (; current_work_part < current_num_parts;
5942 current_work_part += current_concurrent_num_parts){
5944 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
5945 this->max_concurrent_part_calculation);
5947 mj_part_t actual_work_part_count = 0;
5951 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
5952 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
5956 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
5959 ++actual_work_part_count;
5960 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
5961 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
5969 this->mj_get_local_min_max_coord_totW(
5970 coordinate_begin_index,
5971 coordinate_end_index,
5972 this->coordinate_permutations,
5973 mj_current_dim_coords,
5974 this->process_local_min_max_coord_total_weight[kk],
5975 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
5976 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]);
5981 if (actual_work_part_count > 0){
5983 this->mj_get_global_min_max_coord_totW(
5984 current_concurrent_num_parts,
5985 this->process_local_min_max_coord_total_weight,
5986 this->global_min_max_coord_total_weight);
5990 mj_part_t total_incomplete_cut_count = 0;
5995 mj_part_t concurrent_part_cut_shift = 0;
5996 mj_part_t concurrent_part_part_shift = 0;
5997 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
5998 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
5999 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6000 current_concurrent_num_parts];
6002 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6003 2 * current_concurrent_num_parts];
6005 mj_part_t concurrent_current_part_index = current_work_part + kk;
6007 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6009 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6010 mj_scalar_t *current_target_part_weights = this->target_part_weights +
6011 concurrent_part_part_shift;
6013 concurrent_part_cut_shift += partition_count - 1;
6015 concurrent_part_part_shift += partition_count;
6020 if(partition_count > 1 && min_coordinate <= max_coordinate){
6024 total_incomplete_cut_count += partition_count - 1;
6027 this->my_incomplete_cut_count[kk] = partition_count - 1;
6030 this->mj_get_initial_cut_coords_target_weights(
6033 partition_count - 1,
6034 global_total_weight,
6036 current_target_part_weights,
6037 future_num_part_in_parts,
6038 next_future_num_parts_in_parts,
6039 concurrent_current_part_index,
6040 obtained_part_index);
6042 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6043 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
6047 this->set_initial_coordinate_parts(
6050 concurrent_current_part_index,
6051 coordinate_begin_index, coordinate_end_index,
6052 this->coordinate_permutations,
6053 mj_current_dim_coords,
6054 this->assigned_part_ids,
6059 this->my_incomplete_cut_count[kk] = 0;
6061 obtained_part_index += partition_count;
6068 mj_scalar_t used_imbalance = 0;
6073 mj_current_dim_coords,
6076 current_concurrent_num_parts,
6077 current_cut_coordinates,
6078 total_incomplete_cut_count,
6079 num_partitioning_in_current_dim);
6084 mj_part_t output_array_shift = 0;
6085 mj_part_t cut_shift = 0;
6086 size_t tlr_shift = 0;
6087 size_t partweight_array_shift = 0;
6089 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6090 mj_part_t current_concurrent_work_part = current_work_part + kk;
6091 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6094 if((num_parts != 1 )
6096 this->global_min_max_coord_total_weight[kk] >
6097 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6101 for(mj_part_t jj = 0; jj < num_parts; ++jj){
6102 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6104 cut_shift += num_parts - 1;
6105 tlr_shift += (4 *(num_parts - 1) + 1);
6106 output_array_shift += num_parts;
6107 partweight_array_shift += (2 * (num_parts - 1) + 1);
6111 mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6112 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6113 current_concurrent_work_part -1];
6114 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6115 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6120 for(
int ii = 0; ii < this->num_threads; ++ii){
6121 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6125 if(this->mj_keep_part_boxes){
6127 for (mj_part_t j = 0; j < num_parts - 1; ++j){
6128 (*output_part_boxes)[output_array_shift + output_part_index +
6129 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6132 (*output_part_boxes)[output_array_shift + output_part_index + j +
6133 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6139 this->mj_create_new_partitions(
6141 mj_current_dim_coords,
6142 current_concurrent_cut_coordinate,
6145 used_local_cut_line_weight_to_left,
6146 this->thread_part_weight_work,
6147 this->new_part_xadj + output_part_index + output_array_shift
6154 mj_lno_t part_size = coordinate_end - coordinate_begin;
6155 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6157 this->new_coordinate_permutations + coordinate_begin,
6158 this->coordinate_permutations + coordinate_begin,
6159 part_size *
sizeof(mj_lno_t));
6161 cut_shift += num_parts - 1;
6162 tlr_shift += (4 *(num_parts - 1) + 1);
6163 output_array_shift += num_parts;
6164 partweight_array_shift += (2 * (num_parts - 1) + 1);
6174 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6175 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6176 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6178 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6181 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6183 output_part_index += num_parts ;
6190 int current_world_size = this->comm->getSize();
6191 long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6194 bool is_migrated_in_current_dimension =
false;
6199 if (future_num_parts > 1 &&
6200 this->check_migrate_avoid_migration_option >= 0 &&
6201 current_world_size > 1){
6203 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6204 mj_part_t num_parts = output_part_count_in_dimension;
6205 if ( this->mj_perform_migration(
6208 next_future_num_parts_in_parts,
6209 output_part_begin_index,
6210 migration_reduce_all_population,
6211 this->num_local_coords / (future_num_parts * current_num_parts),
6213 input_part_boxes, output_part_boxes) ) {
6214 is_migrated_in_current_dimension =
true;
6215 is_data_ever_migrated =
true;
6216 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" +
6219 this->total_dim_num_reduce_all /= num_parts;
6222 is_migrated_in_current_dimension =
false;
6223 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6228 mj_lno_t * tmp = this->coordinate_permutations;
6229 this->coordinate_permutations = this->new_coordinate_permutations;
6230 this->new_coordinate_permutations = tmp;
6232 if(!is_migrated_in_current_dimension){
6233 this->total_dim_num_reduce_all -= current_num_parts;
6234 current_num_parts = output_part_count_in_dimension;
6236 freeArray<mj_lno_t>(this->part_xadj);
6237 this->part_xadj = this->new_part_xadj;
6238 this->new_part_xadj = NULL;
6239 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6243 delete future_num_part_in_parts;
6244 delete next_future_num_parts_in_parts;
6246 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6253 this->set_final_parts(
6255 output_part_begin_index,
6257 is_data_ever_migrated);
6259 result_assigned_part_ids_ = this->assigned_part_ids;
6260 result_mj_gnos_ = this->current_mj_gnos;
6262 this->free_work_memory();
6263 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Total");
6264 this->mj_env->debug(3,
"Out of MultiJagged");
6272 template <
typename Adapter>
6277 #ifndef DOXYGEN_SHOULD_SKIP_THIS 6280 typedef typename Adapter::scalar_t mj_scalar_t;
6281 typedef typename Adapter::gno_t mj_gno_t;
6282 typedef typename Adapter::lno_t mj_lno_t;
6283 typedef typename Adapter::node_t mj_node_t;
6284 typedef typename Adapter::part_t mj_part_t;
6286 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6290 RCP<const Environment> mj_env;
6291 RCP<const Comm<int> > mj_problemComm;
6292 RCP<const coordinateModel_t> mj_coords;
6295 double imbalance_tolerance;
6296 size_t num_global_parts;
6297 mj_part_t *part_no_array;
6298 int recursion_depth;
6301 mj_lno_t num_local_coords;
6302 mj_gno_t num_global_coords;
6303 const mj_gno_t *initial_mj_gnos;
6304 mj_scalar_t **mj_coordinates;
6306 int num_weights_per_coord;
6307 bool *mj_uniform_weights;
6308 mj_scalar_t **mj_weights;
6309 bool *mj_uniform_parts;
6310 mj_scalar_t **mj_part_sizes;
6312 bool distribute_points_on_cut_lines;
6313 mj_part_t max_concurrent_part_calculation;
6314 int check_migrate_avoid_migration_option;
6315 mj_scalar_t minimum_migration_imbalance;
6316 bool mj_keep_part_boxes;
6322 ArrayRCP<mj_part_t> comXAdj_;
6323 ArrayRCP<mj_part_t> comAdj_;
6329 ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6331 void set_up_partitioning_data(
6334 void set_input_parameters(
const Teuchos::ParameterList &p);
6336 void free_work_memory();
6338 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
6343 RCP<
const Comm<int> > &problemComm,
6344 const RCP<const coordinateModel_t> &coords) :
6345 mj_partitioner(), mj_env(env),
6346 mj_problemComm(problemComm),
6348 imbalance_tolerance(0),
6349 num_global_parts(1), part_no_array(NULL),
6351 coord_dim(0),num_local_coords(0), num_global_coords(0),
6352 initial_mj_gnos(NULL), mj_coordinates(NULL),
6353 num_weights_per_coord(0),
6354 mj_uniform_weights(NULL), mj_weights(NULL),
6355 mj_uniform_parts(NULL),
6356 mj_part_sizes(NULL),
6357 distribute_points_on_cut_lines(true),
6358 max_concurrent_part_calculation(1),
6359 check_migrate_avoid_migration_option(0),
6360 minimum_migration_imbalance(0.30),
6361 mj_keep_part_boxes(false), num_threads(1), mj_run_as_rcb(false),
6362 comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6365 if (coordinate_ArrayRCP_holder != NULL){
6366 delete [] this->coordinate_ArrayRCP_holder;
6367 this->coordinate_ArrayRCP_holder = NULL;
6375 const bool bUnsorted =
true;
6376 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6378 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning " 6379 "algorithm. As many as the dimension count.", mj_parts_Validator);
6381 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut " 6384 pl.set(
"mj_minimum_migration_imbalance", 1.1,
6385 "mj_minimum_migration_imbalance, the minimum imbalance of the " 6386 "processors to avoid migration",
6389 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6390 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6391 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision " 6392 "depending on the imbalance, 1 for forcing migration, 2 for " 6393 "avoiding migration", mj_migration_option_validator);
6396 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the " 6400 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
6403 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be " 6417 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6421 mj_part_t
pointAssign(
int dim, mj_scalar_t *point)
const;
6423 void boxAssign(
int dim, mj_scalar_t *lower, mj_scalar_t *upper,
6424 size_t &nPartsFound, mj_part_t **partsFound)
const;
6431 ArrayRCP<mj_part_t> &comXAdj,
6432 ArrayRCP<mj_part_t> &comAdj);
6445 template <
typename Adapter>
6450 this->set_up_partitioning_data(solution);
6451 this->set_input_parameters(this->mj_env->getParameters());
6452 if (this->mj_keep_part_boxes){
6453 this->mj_partitioner.set_to_keep_part_boxes();
6455 this->mj_partitioner.set_partitioning_parameters(
6456 this->distribute_points_on_cut_lines,
6457 this->max_concurrent_part_calculation,
6458 this->check_migrate_avoid_migration_option,
6459 this->minimum_migration_imbalance);
6461 mj_part_t *result_assigned_part_ids = NULL;
6462 mj_gno_t *result_mj_gnos = NULL;
6463 this->mj_partitioner.multi_jagged_part(
6465 this->mj_problemComm,
6467 this->imbalance_tolerance,
6468 this->num_global_parts,
6469 this->part_no_array,
6470 this->recursion_depth,
6473 this->num_local_coords,
6474 this->num_global_coords,
6475 this->initial_mj_gnos,
6476 this->mj_coordinates,
6478 this->num_weights_per_coord,
6479 this->mj_uniform_weights,
6481 this->mj_uniform_parts,
6482 this->mj_part_sizes,
6484 result_assigned_part_ids,
6489 #if defined(__cplusplus) && __cplusplus >= 201103L 6490 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6491 localGidToLid.reserve(this->num_local_coords);
6492 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6493 localGidToLid[this->initial_mj_gnos[i]] = i;
6495 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[this->num_local_coords],
6496 0, this->num_local_coords,
true);
6498 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6499 mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6500 partId[origLID] = result_assigned_part_ids[i];
6504 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6505 localGidToLid(this->num_local_coords);
6506 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6507 localGidToLid.put(this->initial_mj_gnos[i], i);
6509 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[this->num_local_coords],
6510 0, this->num_local_coords,
true);
6512 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6513 mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6514 partId[origLID] = result_assigned_part_ids[i];
6517 #endif // C++11 is enabled 6519 delete [] result_mj_gnos;
6520 delete [] result_assigned_part_ids;
6522 solution->setParts(partId);
6523 this->free_work_memory();
6528 template <
typename Adapter>
6530 freeArray<mj_scalar_t *>(this->mj_coordinates);
6531 freeArray<mj_scalar_t *>(this->mj_weights);
6532 freeArray<bool>(this->mj_uniform_parts);
6533 freeArray<mj_scalar_t *>(this->mj_part_sizes);
6534 freeArray<bool>(this->mj_uniform_weights);
6540 template <
typename Adapter>
6541 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
6542 const RCP<PartitioningSolution<Adapter> > &solution
6545 this->coord_dim = this->mj_coords->getCoordinateDim();
6546 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
6547 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
6548 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
6549 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
6554 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
6557 this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
6558 this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
6561 this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
6564 this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
6566 this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
6568 typedef StridedData<mj_lno_t, mj_scalar_t> input_t;
6569 ArrayView<const mj_gno_t> gnos;
6570 ArrayView<input_t> xyz;
6571 ArrayView<input_t> wgts;
6574 this->coordinate_ArrayRCP_holder =
new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
6576 this->mj_coords->getCoordinates(gnos, xyz, wgts);
6578 ArrayView<const mj_gno_t> mj_gnos = gnos;
6579 this->initial_mj_gnos = mj_gnos.getRawPtr();
6582 for (
int dim=0; dim < this->coord_dim; dim++){
6583 ArrayRCP<const mj_scalar_t> ar;
6584 xyz[dim].getInputArray(ar);
6585 this->coordinate_ArrayRCP_holder[dim] = ar;
6588 this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
6592 if (this->num_weights_per_coord == 0){
6593 this->mj_uniform_weights[0] =
true;
6594 this->mj_weights[0] = NULL;
6598 for (
int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
6599 ArrayRCP<const mj_scalar_t> ar;
6600 wgts[wdim].getInputArray(ar);
6601 this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
6602 this->mj_uniform_weights[wdim] =
false;
6603 this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
6607 for (
int wdim = 0; wdim < criteria_dim; wdim++){
6608 if (solution->criteriaHasUniformPartSizes(wdim)){
6609 this->mj_uniform_parts[wdim] =
true;
6610 this->mj_part_sizes[wdim] = NULL;
6613 std::cerr <<
"MJ does not support non uniform target part weights" << std::endl;
6622 template <
typename Adapter>
6623 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
const Teuchos::ParameterList &pl){
6625 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
6628 tol = pe->getValue(&tol);
6629 this->imbalance_tolerance = tol - 1.0;
6633 if (this->imbalance_tolerance <= 0)
6634 this->imbalance_tolerance= 10e-4;
6637 this->part_no_array = NULL;
6639 this->recursion_depth = 0;
6641 if (pl.getPtr<Array <mj_part_t> >(
"mj_parts")){
6642 this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >(
"mj_parts")->getRawPtr();
6643 this->recursion_depth = pl.getPtr<Array <mj_part_t> >(
"mj_parts")->size() - 1;
6644 this->mj_env->debug(2,
"mj_parts provided by user");
6648 this->distribute_points_on_cut_lines =
true;
6649 this->max_concurrent_part_calculation = 1;
6651 this->mj_run_as_rcb =
false;
6652 int mj_user_recursion_depth = -1;
6653 this->mj_keep_part_boxes =
false;
6654 this->check_migrate_avoid_migration_option = 0;
6655 this->minimum_migration_imbalance = 0.35;
6657 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
6660 imb = pe->getValue(&imb);
6661 this->minimum_migration_imbalance = imb - 1.0;
6664 pe = pl.getEntryPtr(
"mj_migration_option");
6666 this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
6668 this->check_migrate_avoid_migration_option = 0;
6670 if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
6673 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
6675 this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
6677 this->max_concurrent_part_calculation = 1;
6680 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
6682 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
6684 this->mj_keep_part_boxes =
false;
6696 if (this->mj_keep_part_boxes ==
false){
6697 pe = pl.getEntryPtr(
"mapping_type");
6699 int mapping_type = -1;
6700 mapping_type = pe->getValue(&mapping_type);
6701 if (mapping_type == 0){
6702 mj_keep_part_boxes =
true;
6708 pe = pl.getEntryPtr(
"mj_enable_rcb");
6710 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
6712 this->mj_run_as_rcb =
false;
6715 pe = pl.getEntryPtr(
"mj_recursion_depth");
6717 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
6719 mj_user_recursion_depth = -1;
6723 pe = pl.getEntryPtr(
"rectilinear");
6724 if (pe) val = pe->getValue(&val);
6726 this->distribute_points_on_cut_lines =
false;
6728 this->distribute_points_on_cut_lines =
true;
6731 if (this->mj_run_as_rcb){
6732 mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
6734 if (this->recursion_depth < 1){
6735 if (mj_user_recursion_depth > 0){
6736 this->recursion_depth = mj_user_recursion_depth;
6739 this->recursion_depth = this->coord_dim;
6743 this->num_threads = 1;
6744 #ifdef HAVE_ZOLTAN2_OMP 6745 #pragma omp parallel 6747 this->num_threads = omp_get_num_threads();
6754 template <
typename Adapter>
6757 typename Adapter::scalar_t *lower,
6758 typename Adapter::scalar_t *upper,
6759 size_t &nPartsFound,
6760 typename Adapter::part_t **partsFound)
const 6769 if (this->mj_keep_part_boxes) {
6772 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6774 size_t nBoxes = (*partBoxes).size();
6776 throw std::logic_error(
"no part boxes exist");
6780 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6782 if (globalBox->boxesOverlap(dim, lower, upper)) {
6784 std::vector<typename Adapter::part_t> partlist;
6787 for (
size_t i = 0; i < nBoxes; i++) {
6789 if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
6791 partlist.push_back((*partBoxes)[i].getpId());
6812 *partsFound =
new mj_part_t[nPartsFound];
6813 for (
size_t i = 0; i < nPartsFound; i++)
6814 (*partsFound)[i] = partlist[i];
6827 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
6832 template <
typename Adapter>
6835 typename Adapter::scalar_t *point)
const 6842 if (this->mj_keep_part_boxes) {
6843 typename Adapter::part_t foundPart = -1;
6846 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6848 size_t nBoxes = (*partBoxes).size();
6850 throw std::logic_error(
"no part boxes exist");
6854 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6856 if (globalBox->pointInBox(dim, point)) {
6860 for (i = 0; i < nBoxes; i++) {
6862 if ((*partBoxes)[i].pointInBox(dim, point)) {
6863 foundPart = (*partBoxes)[i].getpId();
6877 std::ostringstream oss;
6879 for (
int j = 0; j < dim; j++) oss << point[j] <<
" ";
6880 oss <<
") not found in domain";
6881 throw std::logic_error(oss.str());
6890 size_t closestBox = 0;
6891 mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
6892 mj_scalar_t *centroid =
new mj_scalar_t[dim];
6893 for (
size_t i = 0; i < nBoxes; i++) {
6894 (*partBoxes)[i].computeCentroid(centroid);
6895 mj_scalar_t sum = 0.;
6897 for (
int j = 0; j < dim; j++) {
6898 diff = centroid[j] - point[j];
6901 if (sum < minDistance) {
6906 foundPart = (*partBoxes)[closestBox].getpId();
6913 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
6917 template <
typename Adapter>
6923 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
6924 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6925 mj_part_t ntasks = (*pBoxes).size();
6926 int dim = (*pBoxes)[0].getDim();
6935 template <
typename Adapter>
6936 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
6939 return this->mj_partitioner.get_kept_boxes();
6943 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6945 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6948 if (this->mj_keep_part_boxes)
6949 return this->kept_boxes;
6951 throw std::logic_error(
"Error: part boxes are not stored.");
6954 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6956 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6958 RCP<mj_partBoxVector_t> &localPartBoxes
6961 mj_part_t ntasks = this->num_global_parts;
6962 int dim = (*localPartBoxes)[0].getDim();
6963 mj_scalar_t *localPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
6965 memset(localPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
6967 mj_scalar_t *globalPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
6968 memset(globalPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
6970 mj_scalar_t *localPartMins = localPartBoundaries;
6971 mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
6973 mj_scalar_t *globalPartMins = globalPartBoundaries;
6974 mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
6976 mj_part_t boxCount = localPartBoxes->size();
6977 for (mj_part_t i = 0; i < boxCount; ++i){
6978 mj_part_t pId = (*localPartBoxes)[i].getpId();
6981 mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
6982 mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
6984 for (
int j = 0; j < dim; ++j){
6985 localPartMins[dim * pId + j] = lmins[j];
6986 localPartMaxs[dim * pId + j] = lmaxs[j];
6998 reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
6999 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7000 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
7001 for (mj_part_t i = 0; i < ntasks; ++i){
7003 globalPartMins + dim * i,
7004 globalPartMaxs + dim * i);
7016 delete []localPartBoundaries;
7017 delete []globalPartBoundaries;
#define MIN_WORK_LAST_DIM
GridHash Class, Hashing Class for part boxes.
Time an algorithm (or other entity) as a whole.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, mj_scalar_t **mj_coordinates, mj_lno_t *initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth, const mj_part_t *part_no_array, bool partition_along_longest_dim)
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, mj_scalar_t minimum_migration_imbalance_)
Multi Jagged coordinate partitioning algorithm.
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Defines Parameter related enumerators, declares functions.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
bool operator>(const uSignedSortItem< IT, WT, SIGN > &rhs) const
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Sort items for quick sort function.
mj_part_t pointAssign(int dim, mj_scalar_t *point) const
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
#define imbalanceOf2(Wachieved, wExpected)
void freeArray(T *&array)
Frees the given array.
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
A ParameterList validator for integer range lists.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
bool operator>=(const uSignedSortItem< IT, WT, SIGN > &rhs)
#define FUTURE_REDUCEALL_CUTOFF
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
T * allocMemory(size_t size)
Allocates memory for the given size.
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
#define LEAST_SIGNIFICANCE
RCP< mj_partBoxVector_t > get_kept_boxes() const
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
Algorithm defines the base class for all algorithms.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
uMultiSortItem(IT index_, CT count_, WT *vals_)
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Define IntegerRangeList validator.
Defines the CoordinateModel classes.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Contains Teuchos redcution operators for the Multi-jagged algorthm.
void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.