Zoltan2
Zoltan2_PartitioningSolution.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
45 
50 #ifndef _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
51 #define _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
52 
53 namespace Zoltan2 {
54 template <typename Adapter>
56 }
57 
58 #include <Zoltan2_Environment.hpp>
59 #include <Zoltan2_Solution.hpp>
60 #include <Zoltan2_GreedyMWM.hpp>
61 #include <Zoltan2_Algorithm.hpp>
63 #include <cmath>
64 #include <algorithm>
65 #include <vector>
66 #include <limits>
67 
68 #ifdef _MSC_VER
69 #define NOMINMAX
70 #include <windows.h>
71 #endif
72 
73 
74 namespace Zoltan2 {
75 
90 template <typename Adapter>
91  class PartitioningSolution : public Solution
92 {
93 public:
94 
95 #ifndef DOXYGEN_SHOULD_SKIP_THIS
96  typedef typename Adapter::gno_t gno_t;
97  typedef typename Adapter::scalar_t scalar_t;
98  typedef typename Adapter::lno_t lno_t;
99  typedef typename Adapter::part_t part_t;
100  typedef typename Adapter::user_t user_t;
101 #endif
102 
120  PartitioningSolution( const RCP<const Environment> &env,
121  const RCP<const Comm<int> > &comm,
122  int nUserWeights,
123  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
124 
154  PartitioningSolution(const RCP<const Environment> &env,
155  const RCP<const Comm<int> > &comm,
156  int nUserWeights, ArrayView<ArrayRCP<part_t> > reqPartIds,
157  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
158  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
159 
161  // Information that the algorithm may wish to query.
162 
165  size_t getTargetGlobalNumberOfParts() const { return nGlobalParts_; }
166 
169  size_t getActualGlobalNumberOfParts() const { return nGlobalPartsSolution_; }
170 
173  size_t getLocalNumberOfParts() const { return nLocalParts_; }
174 
182  scalar_t getLocalFractionOfPart() const { return localFraction_; }
183 
193  bool oneToOnePartDistribution() const { return onePartPerProc_; }
194 
214  const int *getPartDistribution() const {
215  if (partDist_.size() > 0) return &partDist_[0];
216  else return NULL;
217  }
218 
235  const part_t *getProcDistribution() const {
236  if (procDist_.size() > 0) return &procDist_[0];
237  else return NULL;
238  }
239 
243  int getNumberOfCriteria() const { return nWeightsPerObj_; }
244 
245 
252  bool criteriaHasUniformPartSizes(int idx) const { return pSizeUniform_[idx];}
253 
266  scalar_t getCriteriaPartSize(int idx, part_t part) const {
267  if (pSizeUniform_[idx])
268  return 1.0 / nGlobalParts_;
269  else if (pCompactIndex_[idx].size())
270  return pSize_[idx][pCompactIndex_[idx][part]];
271  else
272  return pSize_[idx][part];
273  }
274 
288  bool criteriaHaveSamePartSizes(int c1, int c2) const;
289 
291  // Method used by the algorithm to set results.
292 
319  void setParts(ArrayRCP<part_t> &partList);
320 
322 
334  void RemapParts();
335 
337  /* Return the weight of objects staying with a given remap.
338  * If remap is NULL, compute weight of objects staying with given partition
339  */
340  long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt,
341  part_t nrhs, part_t nlhs)
342  {
343  long staying = 0;
344  for (part_t i = 0; i < nrhs; i++) {
345  part_t k = (remap ? remap[i] : i);
346  for (part_t j = idx[k]; j < idx[k+1]; j++) {
347  if (i == (adj[j]-nlhs)) {
348  staying += wgt[j];
349  break;
350  }
351  }
352  }
353  return staying;
354  }
355 
357  // Results that may be queried by the user, by migration methods,
358  // or by metric calculation methods.
359  // We return raw pointers so users don't have to learn about our
360  // pointer wrappers.
361 
364  inline const RCP<const Comm<int> > &getCommunicator() const { return comm_;}
365 
368  inline const RCP<const Environment> &getEnvironment() const { return env_;}
369 
372  const part_t *getPartListView() const {
373  if (parts_.size() > 0) return parts_.getRawPtr();
374  else return NULL;
375  }
376 
381  const int *getProcListView() const {
382  if (procs_.size() > 0) return procs_.getRawPtr();
383  else return NULL;
384  }
385 
386 
389  std::vector<Zoltan2::coordinateModelPartBox<scalar_t, part_t> > &
391  {
392  return this->algorithm_->getPartBoxesView();
393  }
394 
396  // when a point lies on a part boundary, the lowest part
397  // number on that boundary is returned.
398  // Note that not all partitioning algorithms will support
399  // this method.
400  //
401  // \param dim : the number of dimensions specified for the point in space
402  // \param point : the coordinates of the point in space; array of size dim
403  // \return the part number of a part overlapping the given point
404  part_t pointAssign(int dim, scalar_t *point) const
405  {
406  part_t p;
407  try {
408  if (this->algorithm_ == Teuchos::null)
409  throw std::logic_error("no partitioning algorithm has been run yet");
410 
411  p = this->algorithm_->pointAssign(dim, point);
412  }
414  return p;
415  }
416 
418  // This method allocates memory for the return argument, but does not
419  // control that memory. The user is responsible for freeing the
420  // memory.
421  //
422  // \param dim : (in) the number of dimensions specified for the box
423  // \param lower : (in) the coordinates of the lower corner of the box;
424  // array of size dim
425  // \param upper : (in) the coordinates of the upper corner of the box;
426  // array of size dim
427  // \param nPartsFound : (out) the number of parts overlapping the box
428  // \param partsFound : (out) array of parts overlapping the box
429  void boxAssign(int dim, scalar_t *lower, scalar_t *upper,
430  size_t &nPartsFound, part_t **partsFound) const
431  {
432  try {
433  if (this->algorithm_ == Teuchos::null)
434  throw std::logic_error("no partitioning algorithm has been run yet");
435 
436  this->algorithm_->boxAssign(dim, lower, upper, nPartsFound, partsFound);
437  }
439  }
440 
441 
444  void getCommunicationGraph(ArrayRCP <part_t> &comXAdj,
445  ArrayRCP <part_t> &comAdj) const
446  {
447  try {
448  if (this->algorithm_ == Teuchos::null)
449  throw std::logic_error("no partitioning algorithm has been run yet");
450 
451  this->algorithm_->getCommunicationGraph(this, comXAdj, comAdj);
452  }
454  }
455 
473  void getPartsForProc(int procId, double &numParts, part_t &partMin,
474  part_t &partMax) const
475  {
476  env_->localInputAssertion(__FILE__, __LINE__, "invalid process id",
477  procId >= 0 && procId < comm_->getSize(), BASIC_ASSERTION);
478 
479  procToPartsMap(procId, numParts, partMin, partMax);
480  }
481 
493  void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
494  {
495  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
496  partId >= 0 && partId < nGlobalParts_, BASIC_ASSERTION);
497 
498  partToProcsMap(partId, procMin, procMax);
499  }
500 
501 private:
502  void partToProc(bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
503  int numLocalParts, int numGlobalParts);
504 
505  void procToPartsMap(int procId, double &numParts, part_t &partMin,
506  part_t &partMax) const;
507 
508  void partToProcsMap(part_t partId, int &procMin, int &procMax) const;
509 
510  void setPartDistribution();
511 
512  void setPartSizes(ArrayView<ArrayRCP<part_t> > reqPartIds,
513  ArrayView<ArrayRCP<scalar_t> > reqPartSizes);
514 
515  void computePartSizes(int widx, ArrayView<part_t> ids,
516  ArrayView<scalar_t> sizes);
517 
518  void broadcastPartSizes(int widx);
519 
520 
521  RCP<const Environment> env_; // has application communicator
522  const RCP<const Comm<int> > comm_; // the problem communicator
523 
524  //part box boundaries as a result of geometric partitioning algorithm.
525  RCP < std::vector <Zoltan2::coordinateModelPartBox <scalar_t, part_t> > > partBoxes;
526 
527  part_t nGlobalParts_;// target global number of parts
528  part_t nLocalParts_; // number of parts to be on this process
529 
530  scalar_t localFraction_; // approx fraction of a part on this process
531  int nWeightsPerObj_; // if user has no weights, this is 1 TODO: WHY???
532 
533  // If process p is to be assigned part p for all p, then onePartPerProc_
534  // is true. Otherwise it is false, and either procDist_ or partDist_
535  // describes the allocation of parts to processes.
536  //
537  // If parts are never split across processes, then procDist_ is defined
538  // as follows:
539  //
540  // partId = procDist_[procId]
541  // partIdNext = procDist_[procId+1]
542  // globalNumberOfParts = procDist_[numProcs]
543  //
544  // meaning that the parts assigned to process procId range from
545  // [partId, partIdNext). If partIdNext is the same as partId, then
546  // process procId has no parts.
547  //
548  // If the number parts is less than the number of processes, and the
549  // user did not specify "num_local_parts" for each of the processes, then
550  // parts are split across processes, and partDist_ is defined rather than
551  // procDist_.
552  //
553  // procId = partDist_[partId]
554  // procIdNext = partDist_[partId+1]
555  // globalNumberOfProcs = partDist_[numParts]
556  //
557  // which implies that the part partId is shared by processes in the
558  // the range [procId, procIdNext).
559  //
560  // We use std::vector so we can use upper_bound algorithm
561 
562  bool onePartPerProc_; // either this is true...
563  std::vector<int> partDist_; // or this is defined ...
564  std::vector<part_t> procDist_; // or this is defined.
565  bool procDistEquallySpread_; // if procDist_ is used and
566  // #parts > #procs and
567  // num_local_parts is not specified,
568  // parts are evenly distributed to procs
569 
570  // In order to minimize the storage required for part sizes, we
571  // have three different representations.
572  //
573  // If the part sizes for weight index w are all the same, then:
574  // pSizeUniform_[w] = true
575  // pCompactIndex_[w].size() = 0
576  // pSize_[w].size() = 0
577  //
578  // and the size for part p is 1.0 / nparts.
579  //
580  // If part sizes differ for each part in weight index w, but there
581  // are no more than 64 distinct sizes:
582  // pSizeUniform_[w] = false
583  // pCompactIndex_[w].size() = number of parts
584  // pSize_[w].size() = number of different sizes
585  //
586  // and the size for part p is pSize_[pCompactIndex_[p]].
587  //
588  // If part sizes differ for each part in weight index w, and there
589  // are more than 64 distinct sizes:
590  // pSizeUniform_[w] = false
591  // pCompactIndex_[w].size() = 0
592  // pSize_[w].size() = nparts
593  //
594  // and the size for part p is pSize_[p].
595  //
596  // NOTE: If we expect to have similar cases, i.e. a long list of scalars
597  // where it is highly possible that just a few unique values appear,
598  // then we may want to make this a class. The advantage is that we
599  // save a long list of 1-byte indices instead of a long list of scalars.
600 
601  ArrayRCP<bool> pSizeUniform_;
602  ArrayRCP<ArrayRCP<unsigned char> > pCompactIndex_;
603  ArrayRCP<ArrayRCP<scalar_t> > pSize_;
604 
606  // The algorithm sets these values upon completion.
607 
608  ArrayRCP<part_t> parts_; // part number assigned to localid[i]
609 
610  bool haveSolution_;
611 
612  part_t nGlobalPartsSolution_; // global number of parts in solution
613 
615  // The solution calculates this from the part assignments,
616  // unless onePartPerProc_.
617 
618  ArrayRCP<int> procs_; // process rank assigned to localid[i]
619 
621  // Algorithm used to compute the solution;
622  // needed for post-processing with pointAssign or getCommunicationGraph
623  const RCP<Algorithm<Adapter> > algorithm_; //
624 };
625 
627 // Definitions
629 
630 template <typename Adapter>
632  const RCP<const Environment> &env,
633  const RCP<const Comm<int> > &comm,
634  int nUserWeights,
635  const RCP<Algorithm<Adapter> > &algorithm)
636  : env_(env), comm_(comm),
637  partBoxes(),
638  nGlobalParts_(0), nLocalParts_(0),
639  localFraction_(0), nWeightsPerObj_(),
640  onePartPerProc_(false), partDist_(), procDist_(),
641  procDistEquallySpread_(false),
642  pSizeUniform_(), pCompactIndex_(), pSize_(),
643  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
644  procs_(), algorithm_(algorithm)
645 {
646  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
647 
648  setPartDistribution();
649 
650  // We must call setPartSizes() because part sizes may have
651  // been provided by the user on other processes.
652 
653  ArrayRCP<part_t> *noIds = new ArrayRCP<part_t> [nWeightsPerObj_];
654  ArrayRCP<scalar_t> *noSizes = new ArrayRCP<scalar_t> [nWeightsPerObj_];
655  ArrayRCP<ArrayRCP<part_t> > ids(noIds, 0, nWeightsPerObj_, true);
656  ArrayRCP<ArrayRCP<scalar_t> > sizes(noSizes, 0, nWeightsPerObj_, true);
657 
658  setPartSizes(ids.view(0, nWeightsPerObj_), sizes.view(0, nWeightsPerObj_));
659 
660  env_->memory("After construction of solution");
661 }
662 
663 template <typename Adapter>
665  const RCP<const Environment> &env,
666  const RCP<const Comm<int> > &comm,
667  int nUserWeights,
668  ArrayView<ArrayRCP<part_t> > reqPartIds,
669  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
670  const RCP<Algorithm<Adapter> > &algorithm)
671  : env_(env), comm_(comm),
672  partBoxes(),
673  nGlobalParts_(0), nLocalParts_(0),
674  localFraction_(0), nWeightsPerObj_(),
675  onePartPerProc_(false), partDist_(), procDist_(),
676  procDistEquallySpread_(false),
677  pSizeUniform_(), pCompactIndex_(), pSize_(),
678  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
679  procs_(), algorithm_(algorithm)
680 {
681  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
682 
683  setPartDistribution();
684 
685  setPartSizes(reqPartIds, reqPartSizes);
686 
687  env_->memory("After construction of solution");
688 }
689 
690 template <typename Adapter>
692 {
693  // Did the caller define num_global_parts and/or num_local_parts?
694 
695  const ParameterList &pl = env_->getParameters();
696  size_t haveGlobalNumParts=0, haveLocalNumParts=0;
697  int numLocal=0, numGlobal=0;
698 
699  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("num_global_parts");
700 
701  if (pe){
702  haveGlobalNumParts = 1;
703  nGlobalParts_ = part_t(pe->getValue(&nGlobalParts_));
704  numGlobal = nGlobalParts_;
705  }
706 
707  pe = pl.getEntryPtr("num_local_parts");
708 
709  if (pe){
710  haveLocalNumParts = 1;
711  nLocalParts_ = part_t(pe->getValue(&nLocalParts_));
712  numLocal = nLocalParts_;
713  }
714 
715  try{
716  // Sets onePartPerProc_, partDist_, and procDist_
717 
718  partToProc(true, haveLocalNumParts, haveGlobalNumParts,
719  numLocal, numGlobal);
720  }
722 
723  int nprocs = comm_->getSize();
724  int rank = comm_->getRank();
725 
726  if (onePartPerProc_){
727  nGlobalParts_ = nprocs;
728  nLocalParts_ = 1;
729  }
730  else if (partDist_.size() > 0){ // more procs than parts
731  nGlobalParts_ = partDist_.size() - 1;
732  int pstart = partDist_[0];
733  for (part_t i=1; i <= nGlobalParts_; i++){
734  int pend = partDist_[i];
735  if (rank >= pstart && rank < pend){
736  int numOwners = pend - pstart;
737  nLocalParts_ = 1;
738  localFraction_ = 1.0 / numOwners;
739  break;
740  }
741  pstart = pend;
742  }
743  }
744  else if (procDist_.size() > 0){ // more parts than procs
745  nGlobalParts_ = procDist_[nprocs];
746  nLocalParts_ = procDist_[rank+1] - procDist_[rank];
747  }
748  else {
749  throw std::logic_error("partToProc error");
750  }
751 
752 }
753 
754 template <typename Adapter>
755  void PartitioningSolution<Adapter>::setPartSizes(
756  ArrayView<ArrayRCP<part_t> > ids, ArrayView<ArrayRCP<scalar_t> > sizes)
757 {
758  int widx = nWeightsPerObj_;
759  bool fail=false;
760 
761  size_t *countBuf = new size_t [widx*2];
762  ArrayRCP<size_t> counts(countBuf, 0, widx*2, true);
763 
764  fail = ((ids.size() != widx) || (sizes.size() != widx));
765 
766  for (int w=0; !fail && w < widx; w++){
767  counts[w] = ids[w].size();
768  if (ids[w].size() != sizes[w].size()) fail=true;
769  }
770 
771  env_->globalBugAssertion(__FILE__, __LINE__, "bad argument arrays", fail==0,
772  COMPLEX_ASSERTION, comm_);
773 
774  // Are all part sizes the same? This is the common case.
775 
776  ArrayRCP<scalar_t> *emptySizes= new ArrayRCP<scalar_t> [widx];
777  pSize_ = arcp(emptySizes, 0, widx);
778 
779  ArrayRCP<unsigned char> *emptyIndices= new ArrayRCP<unsigned char> [widx];
780  pCompactIndex_ = arcp(emptyIndices, 0, widx);
781 
782  bool *info = new bool [widx];
783  pSizeUniform_ = arcp(info, 0, widx);
784  for (int w=0; w < widx; w++)
785  pSizeUniform_[w] = true;
786 
787  if (nGlobalParts_ == 1){
788  return; // there's only one part in the whole problem
789  }
790 
791  size_t *ptr1 = counts.getRawPtr();
792  size_t *ptr2 = counts.getRawPtr() + widx;
793 
794  try{
795  reduceAll<int, size_t>(*comm_, Teuchos::REDUCE_MAX, widx, ptr1, ptr2);
796  }
797  Z2_THROW_OUTSIDE_ERROR(*env_);
798 
799  bool zero = true;
800 
801  for (int w=0; w < widx; w++)
802  if (counts[widx+w] > 0){
803  zero = false;
804  pSizeUniform_[w] = false;
805  }
806 
807  if (zero) // Part sizes for all criteria are uniform.
808  return;
809 
810  // Compute the part sizes for criteria for which part sizes were
811  // supplied. Normalize for each criteria so part sizes sum to one.
812 
813  int nprocs = comm_->getSize();
814  int rank = comm_->getRank();
815 
816  for (int w=0; w < widx; w++){
817  if (pSizeUniform_[w]) continue;
818 
819  // Send all ids and sizes to one process.
820  // (There is no simple gather method in Teuchos.)
821 
822  part_t length = ids[w].size();
823  part_t *allLength = new part_t [nprocs];
824  Teuchos::gatherAll<int, part_t>(*comm_, 1, &length,
825  nprocs, allLength);
826 
827  if (rank == 0){
828  int total = 0;
829  for (int i=0; i < nprocs; i++)
830  total += allLength[i];
831 
832  part_t *partNums = new part_t [total];
833  scalar_t *partSizes = new scalar_t [total];
834 
835  ArrayView<part_t> idArray(partNums, total);
836  ArrayView<scalar_t> sizeArray(partSizes, total);
837 
838  if (length > 0){
839  for (int i=0; i < length; i++){
840  *partNums++ = ids[w][i];
841  *partSizes++ = sizes[w][i];
842  }
843  }
844 
845  for (int p=1; p < nprocs; p++){
846  if (allLength[p] > 0){
847  Teuchos::receive<int, part_t>(*comm_, p,
848  allLength[p], partNums);
849  Teuchos::receive<int, scalar_t>(*comm_, p,
850  allLength[p], partSizes);
851  partNums += allLength[p];
852  partSizes += allLength[p];
853  }
854  }
855 
856  delete [] allLength;
857 
858  try{
859  computePartSizes(w, idArray, sizeArray);
860  }
862 
863  delete [] idArray.getRawPtr();
864  delete [] sizeArray.getRawPtr();
865  }
866  else{
867  delete [] allLength;
868  if (length > 0){
869  Teuchos::send<int, part_t>(*comm_, length, ids[w].getRawPtr(), 0);
870  Teuchos::send<int, scalar_t>(*comm_, length, sizes[w].getRawPtr(), 0);
871  }
872  }
873 
874  broadcastPartSizes(w);
875  }
876 }
877 
878 template <typename Adapter>
879  void PartitioningSolution<Adapter>::broadcastPartSizes(int widx)
880 {
881  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
882  pSize_.size()>widx &&
883  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
885 
886  int rank = comm_->getRank();
887  int nprocs = comm_->getSize();
888  part_t nparts = nGlobalParts_;
889 
890  if (nprocs < 2)
891  return;
892 
893  char flag=0;
894 
895  if (rank == 0){
896  if (pSizeUniform_[widx] == true)
897  flag = 1;
898  else if (pCompactIndex_[widx].size() > 0)
899  flag = 2;
900  else
901  flag = 3;
902  }
903 
904  try{
905  Teuchos::broadcast<int, char>(*comm_, 0, 1, &flag);
906  }
907  Z2_THROW_OUTSIDE_ERROR(*env_);
908 
909  if (flag == 1){
910  if (rank > 0)
911  pSizeUniform_[widx] = true;
912 
913  return;
914  }
915 
916  if (flag == 2){
917 
918  // broadcast the indices into the size list
919 
920  unsigned char *idxbuf = NULL;
921 
922  if (rank > 0){
923  idxbuf = new unsigned char [nparts];
924  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, idxbuf);
925  }
926  else{
927  env_->localBugAssertion(__FILE__, __LINE__, "index list size",
928  pCompactIndex_[widx].size() == nparts, COMPLEX_ASSERTION);
929  idxbuf = pCompactIndex_[widx].getRawPtr();
930  }
931 
932  try{
933  // broadcast of unsigned char is not supported
934  Teuchos::broadcast<int, char>(*comm_, 0, nparts,
935  reinterpret_cast<char *>(idxbuf));
936  }
937  Z2_THROW_OUTSIDE_ERROR(*env_);
938 
939  if (rank > 0)
940  pCompactIndex_[widx] = arcp(idxbuf, 0, nparts, true);
941 
942  // broadcast the list of different part sizes
943 
944  unsigned char maxIdx=0;
945  for (part_t p=0; p < nparts; p++)
946  if (idxbuf[p] > maxIdx) maxIdx = idxbuf[p];
947 
948  int numSizes = maxIdx + 1;
949 
950  scalar_t *sizeList = NULL;
951 
952  if (rank > 0){
953  sizeList = new scalar_t [numSizes];
954  env_->localMemoryAssertion(__FILE__, __LINE__, numSizes, sizeList);
955  }
956  else{
957  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
958  numSizes == pSize_[widx].size(), COMPLEX_ASSERTION);
959 
960  sizeList = pSize_[widx].getRawPtr();
961  }
962 
963  try{
964  Teuchos::broadcast<int, scalar_t>(*comm_, 0, numSizes, sizeList);
965  }
966  Z2_THROW_OUTSIDE_ERROR(*env_);
967 
968  if (rank > 0)
969  pSize_[widx] = arcp(sizeList, 0, numSizes, true);
970 
971  return;
972  }
973 
974  if (flag == 3){
975 
976  // broadcast the size of each part
977 
978  scalar_t *sizeList = NULL;
979 
980  if (rank > 0){
981  sizeList = new scalar_t [nparts];
982  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, sizeList);
983  }
984  else{
985  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
986  nparts == pSize_[widx].size(), COMPLEX_ASSERTION);
987 
988  sizeList = pSize_[widx].getRawPtr();
989  }
990 
991  try{
992  Teuchos::broadcast<int, scalar_t >(*comm_, 0, nparts, sizeList);
993  }
994  Z2_THROW_OUTSIDE_ERROR(*env_);
995 
996  if (rank > 0)
997  pSize_[widx] = arcp(sizeList, 0, nparts);
998 
999  return;
1000  }
1001 }
1002 
1003 template <typename Adapter>
1004  void PartitioningSolution<Adapter>::computePartSizes(int widx,
1005  ArrayView<part_t> ids, ArrayView<scalar_t> sizes)
1006 {
1007  int len = ids.size();
1008 
1009  if (len == 0){
1010  pSizeUniform_[widx] = true;
1011  return;
1012  }
1013 
1014  env_->localBugAssertion(__FILE__, __LINE__, "bad array sizes",
1015  len>0 && sizes.size()==len, COMPLEX_ASSERTION);
1016 
1017  env_->localBugAssertion(__FILE__, __LINE__, "bad index",
1018  widx>=0 && widx<nWeightsPerObj_, COMPLEX_ASSERTION);
1019 
1020  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
1021  pSize_.size()>widx &&
1022  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
1024 
1025  // Check ids and sizes and find min, max and average sizes.
1026  // If sizes are very close to uniform, call them uniform parts.
1027 
1028  part_t nparts = nGlobalParts_;
1029  unsigned char *buf = new unsigned char [nparts];
1030  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, buf);
1031  memset(buf, 0, nparts);
1032  ArrayRCP<unsigned char> partIdx(buf, 0, nparts, true);
1033 
1034  scalar_t epsilon = 10e-5 / nparts;
1035  scalar_t min=sizes[0], max=sizes[0], sum=0;
1036 
1037  for (int i=0; i < len; i++){
1038  part_t id = ids[i];
1039  scalar_t size = sizes[i];
1040 
1041  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
1042  id>=0 && id<nparts, BASIC_ASSERTION);
1043 
1044  env_->localInputAssertion(__FILE__, __LINE__, "invalid part size", size>=0,
1045  BASIC_ASSERTION);
1046 
1047  // TODO: we could allow users to specify multiple sizes for the same
1048  // part if we add a parameter that says what we are to do with them:
1049  // add them or take the max.
1050 
1051  env_->localInputAssertion(__FILE__, __LINE__,
1052  "multiple sizes provided for one part", partIdx[id]==0, BASIC_ASSERTION);
1053 
1054  partIdx[id] = 1; // mark that we have a size for this part
1055 
1056  if (size < min) min = size;
1057  if (size > max) max = size;
1058  sum += size;
1059  }
1060 
1061  if (sum == 0){
1062 
1063  // User has given us a list of parts of size 0, we'll set
1064  // the rest to them to equally sized parts.
1065 
1066  scalar_t *allSizes = new scalar_t [2];
1067  env_->localMemoryAssertion(__FILE__, __LINE__, 2, allSizes);
1068 
1069  ArrayRCP<scalar_t> sizeArray(allSizes, 0, 2, true);
1070 
1071  int numNonZero = nparts - len;
1072 
1073  allSizes[0] = 0.0;
1074  allSizes[1] = 1.0 / numNonZero;
1075 
1076  for (part_t p=0; p < nparts; p++)
1077  buf[p] = 1; // index to default part size
1078 
1079  for (int i=0; i < len; i++)
1080  buf[ids[i]] = 0; // index to part size zero
1081 
1082  pSize_[widx] = sizeArray;
1083  pCompactIndex_[widx] = partIdx;
1084 
1085  return;
1086  }
1087 
1088  if (max - min <= epsilon){
1089  pSizeUniform_[widx] = true;
1090  return;
1091  }
1092 
1093  // A size for parts that were not specified:
1094  scalar_t avg = sum / nparts;
1095 
1096  // We are going to merge part sizes that are very close. This takes
1097  // computation time now, but can save considerably in the storage of
1098  // all part sizes on each process. For example, a common case may
1099  // be some parts are size 1 and all the rest are size 2.
1100 
1101  scalar_t *tmp = new scalar_t [len];
1102  env_->localMemoryAssertion(__FILE__, __LINE__, len, tmp);
1103  memcpy(tmp, sizes.getRawPtr(), sizeof(scalar_t) * len);
1104  ArrayRCP<scalar_t> partSizes(tmp, 0, len, true);
1105 
1106  std::sort(partSizes.begin(), partSizes.end());
1107 
1108  // create a list of sizes that are unique within epsilon
1109 
1110  Array<scalar_t> nextUniqueSize;
1111  nextUniqueSize.push_back(partSizes[len-1]); // largest
1112  scalar_t curr = partSizes[len-1];
1113  int avgIndex = len;
1114  bool haveAvg = false;
1115  if (curr - avg <= epsilon)
1116  avgIndex = 0;
1117 
1118  for (int i=len-2; i >= 0; i--){
1119  scalar_t val = partSizes[i];
1120  if (curr - val > epsilon){
1121  nextUniqueSize.push_back(val); // the highest in the group
1122  curr = val;
1123  if (avgIndex==len && val > avg && val - avg <= epsilon){
1124  // the average would be in this group
1125  avgIndex = nextUniqueSize.size() - 1;
1126  haveAvg = true;
1127  }
1128  }
1129  }
1130 
1131  partSizes.clear();
1132 
1133  size_t numSizes = nextUniqueSize.size();
1134  int sizeArrayLen = numSizes;
1135 
1136  if (numSizes < 64){
1137  // We can store the size for every part in a compact way.
1138 
1139  // Create a list of all sizes in increasing order
1140 
1141  if (!haveAvg) sizeArrayLen++; // need to include average
1142 
1143  scalar_t *allSizes = new scalar_t [sizeArrayLen];
1144  env_->localMemoryAssertion(__FILE__, __LINE__, sizeArrayLen, allSizes);
1145  ArrayRCP<scalar_t> sizeArray(allSizes, 0, sizeArrayLen, true);
1146 
1147  int newAvgIndex = sizeArrayLen;
1148 
1149  for (int i=numSizes-1, idx=0; i >= 0; i--){
1150 
1151  if (newAvgIndex == sizeArrayLen){
1152 
1153  if (haveAvg && i==avgIndex)
1154  newAvgIndex = idx;
1155 
1156  else if (!haveAvg && avg < nextUniqueSize[i]){
1157  newAvgIndex = idx;
1158  allSizes[idx++] = avg;
1159  }
1160  }
1161 
1162  allSizes[idx++] = nextUniqueSize[i];
1163  }
1164 
1165  env_->localBugAssertion(__FILE__, __LINE__, "finding average in list",
1166  newAvgIndex < sizeArrayLen, COMPLEX_ASSERTION);
1167 
1168  for (int i=0; i < nparts; i++){
1169  buf[i] = newAvgIndex; // index to default part size
1170  }
1171 
1172  sum = (nparts - len) * allSizes[newAvgIndex];
1173 
1174  for (int i=0; i < len; i++){
1175  int id = ids[i];
1176  scalar_t size = sizes[i];
1177  int index;
1178 
1179  // Find the first size greater than or equal to this size.
1180 
1181  if (size < avg && avg - size <= epsilon)
1182  index = newAvgIndex;
1183  else{
1184  typename ArrayRCP<scalar_t>::iterator found =
1185  std::lower_bound(sizeArray.begin(), sizeArray.end(), size);
1186 
1187  env_->localBugAssertion(__FILE__, __LINE__, "size array",
1188  found != sizeArray.end(), COMPLEX_ASSERTION);
1189 
1190  index = found - sizeArray.begin();
1191  }
1192 
1193  buf[id] = index;
1194  sum += allSizes[index];
1195  }
1196 
1197  for (int i=0; i < sizeArrayLen; i++){
1198  sizeArray[i] /= sum;
1199  }
1200 
1201  pCompactIndex_[widx] = partIdx;
1202  pSize_[widx] = sizeArray;
1203  }
1204  else{
1205  // To have access to part sizes, we must store nparts scalar_ts on
1206  // every process. We expect this is a rare case.
1207 
1208  tmp = new scalar_t [nparts];
1209  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, tmp);
1210 
1211  sum += ((nparts - len) * avg);
1212 
1213  for (int i=0; i < nparts; i++){
1214  tmp[i] = avg/sum;
1215  }
1216 
1217  for (int i=0; i < len; i++){
1218  tmp[ids[i]] = sizes[i]/sum;
1219  }
1220 
1221  pSize_[widx] = arcp(tmp, 0, nparts);
1222  }
1223 }
1224 
1225 template <typename Adapter>
1226  void PartitioningSolution<Adapter>::setParts(ArrayRCP<part_t> &partList)
1227 {
1228  env_->debug(DETAILED_STATUS, "Entering setParts");
1229 
1230  size_t len = partList.size();
1231 
1232  // Find the actual number of parts in the solution, which can
1233  // be more or less than the nGlobalParts_ target.
1234  // (We may want to compute the imbalance of a given solution with
1235  // respect to a desired solution. This solution may have more or
1236  // fewer parts that the desired solution.)
1237 
1238  part_t lMax = std::numeric_limits<part_t>::min();
1239  part_t lMin = std::numeric_limits<part_t>::max();
1240  part_t gMax, gMin;
1241 
1242  for (size_t i = 0; i < len; i++) {
1243  if (partList[i] < lMin) lMin = partList[i];
1244  if (partList[i] > lMax) lMax = partList[i];
1245  }
1246  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MIN, 1, &lMin, &gMin);
1247  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MAX, 1, &lMax, &gMax);
1248 
1249  nGlobalPartsSolution_ = gMax - gMin + 1;
1250  parts_ = partList;
1251 
1252  // Now determine which process gets each object, if not one-to-one.
1253 
1254  if (!onePartPerProc_){
1255 
1256  int *procs = new int [len];
1257  env_->localMemoryAssertion(__FILE__, __LINE__, len, procs);
1258  procs_ = arcp<int>(procs, 0, len);
1259 
1260  if (len > 0) {
1261  part_t *parts = partList.getRawPtr();
1262 
1263  if (procDist_.size() > 0){ // parts are not split across procs
1264 
1265  int procId;
1266  for (size_t i=0; i < len; i++){
1267  partToProcsMap(parts[i], procs[i], procId);
1268  }
1269  }
1270  else{ // harder - we need to split the parts across multiple procs
1271 
1272  lno_t *partCounter = new lno_t [nGlobalPartsSolution_];
1273  env_->localMemoryAssertion(__FILE__, __LINE__, nGlobalPartsSolution_,
1274  partCounter);
1275 
1276  int numProcs = comm_->getSize();
1277 
1278  //MD NOTE: there was no initialization for partCounter.
1279  //I added the line below, correct me if I am wrong.
1280  memset(partCounter, 0, sizeof(lno_t) * nGlobalPartsSolution_);
1281 
1282  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++)
1283  partCounter[parts[i]]++;
1284 
1285  lno_t *procCounter = new lno_t [numProcs];
1286  env_->localMemoryAssertion(__FILE__, __LINE__, numProcs, procCounter);
1287 
1288  int proc1;
1289  int proc2 = partDist_[0];
1290 
1291  for (part_t part=1; part < nGlobalParts_; part++){
1292  proc1 = proc2;
1293  proc2 = partDist_[part+1];
1294  int numprocs = proc2 - proc1;
1295 
1296  double dNum = partCounter[part];
1297  double dProcs = numprocs;
1298 
1299  //cout << "dNum:" << dNum << " dProcs:" << dProcs << endl;
1300  double each = floor(dNum/dProcs);
1301  double extra = fmod(dNum,dProcs);
1302 
1303  for (int proc=proc1, i=0; proc<proc2; proc++, i++){
1304  if (i < extra)
1305  procCounter[proc] = lno_t(each) + 1;
1306  else
1307  procCounter[proc] = lno_t(each);
1308  }
1309  }
1310 
1311  delete [] partCounter;
1312 
1313  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++){
1314  if (partList[i] >= nGlobalParts_){
1315  // Solution has more parts that targeted. These
1316  // objects just remain on this process.
1317  procs[i] = comm_->getRank();
1318  continue;
1319  }
1320  part_t partNum = parts[i];
1321  proc1 = partDist_[partNum];
1322  proc2 = partDist_[partNum + 1];
1323 
1324  int proc;
1325  for (proc=proc1; proc < proc2; proc++){
1326  if (procCounter[proc] > 0){
1327  procs[i] = proc;
1328  procCounter[proc]--;
1329  break;
1330  }
1331  }
1332  env_->localBugAssertion(__FILE__, __LINE__, "part to proc",
1333  proc < proc2, COMPLEX_ASSERTION);
1334  }
1335 
1336  delete [] procCounter;
1337  }
1338  }
1339  }
1340 
1341  // Now that parts_ info is back on home process, remap the parts.
1342  // TODO: The parts will be inconsistent with the proc assignments after
1343  // TODO: remapping. This problem will go away after we separate process
1344  // TODO: mapping from setParts. But for MueLu's use case, the part
1345  // TODO: remapping is all that matters; they do not use the process mapping.
1346  int doRemap = 0;
1347  const Teuchos::ParameterEntry *pe =
1348  env_->getParameters().getEntryPtr("remap_parts");
1349  if (pe) doRemap = pe->getValue(&doRemap);
1350  if (doRemap) RemapParts();
1351 
1352  haveSolution_ = true;
1353 
1354  env_->memory("After Solution has processed algorithm's answer");
1355  env_->debug(DETAILED_STATUS, "Exiting setParts");
1356 }
1357 
1358 
1359 template <typename Adapter>
1361  double &numParts, part_t &partMin, part_t &partMax) const
1362 {
1363  if (onePartPerProc_){
1364  numParts = 1.0;
1365  partMin = partMax = procId;
1366  }
1367  else if (procDist_.size() > 0){
1368  partMin = procDist_[procId];
1369  partMax = procDist_[procId+1] - 1;
1370  numParts = procDist_[procId+1] - partMin;
1371  }
1372  else{
1373  // find the first p such that partDist_[p] > procId
1374 
1375  std::vector<int>::const_iterator entry;
1376  entry = std::upper_bound(partDist_.begin(), partDist_.end(), procId);
1377 
1378  size_t partIdx = entry - partDist_.begin();
1379  int numProcs = partDist_[partIdx] - partDist_[partIdx-1];
1380  partMin = partMax = int(partIdx) - 1;
1381  numParts = 1.0 / numProcs;
1382  }
1383 }
1384 
1385 template <typename Adapter>
1386  void PartitioningSolution<Adapter>::partToProcsMap(part_t partId,
1387  int &procMin, int &procMax) const
1388 {
1389  if (partId >= nGlobalParts_){
1390  // setParts() may be given an initial solution which uses a
1391  // different number of parts than the desired solution. It is
1392  // still a solution. We keep it on this process.
1393  procMin = procMax = comm_->getRank();
1394  }
1395  else if (onePartPerProc_){
1396  procMin = procMax = int(partId);
1397  }
1398  else if (procDist_.size() > 0){
1399  if (procDistEquallySpread_) {
1400  // Avoid binary search.
1401  double fProcs = comm_->getSize();
1402  double fParts = nGlobalParts_;
1403  double each = fParts / fProcs;
1404  procMin = int(partId / each);
1405  while (procDist_[procMin] > partId) procMin--;
1406  while (procDist_[procMin+1] <= partId) procMin++;
1407  procMax = procMin;
1408  }
1409  else {
1410  // find the first p such that procDist_[p] > partId.
1411  // For now, do a binary search.
1412 
1413  typename std::vector<part_t>::const_iterator entry;
1414  entry = std::upper_bound(procDist_.begin(), procDist_.end(), partId);
1415 
1416  size_t procIdx = entry - procDist_.begin();
1417  procMin = procMax = int(procIdx) - 1;
1418  }
1419  }
1420  else{
1421  procMin = partDist_[partId];
1422  procMax = partDist_[partId+1] - 1;
1423  }
1424 }
1425 
1426 template <typename Adapter>
1428  int c1, int c2) const
1429 {
1430  if (c1 < 0 || c1 >= nWeightsPerObj_ || c2 < 0 || c2 >= nWeightsPerObj_ )
1431  throw std::logic_error("criteriaHaveSamePartSizes error");
1432 
1433  bool theSame = false;
1434 
1435  if (c1 == c2)
1436  theSame = true;
1437 
1438  else if (pSizeUniform_[c1] == true && pSizeUniform_[c2] == true)
1439  theSame = true;
1440 
1441  else if (pCompactIndex_[c1].size() == pCompactIndex_[c2].size()){
1442  theSame = true;
1443  bool useIndex = pCompactIndex_[c1].size() > 0;
1444  if (useIndex){
1445  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1446  if (pSize_[c1][pCompactIndex_[c1][p]] !=
1447  pSize_[c2][pCompactIndex_[c2][p]])
1448  theSame = false;
1449  }
1450  else{
1451  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1452  if (pSize_[c1][p] != pSize_[c2][p])
1453  theSame = false;
1454  }
1455  }
1456 
1457  return theSame;
1458 }
1459 
1475 template <typename Adapter>
1477  bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
1478  int numLocalParts, int numGlobalParts)
1479 {
1480 #ifdef _MSC_VER
1481  typedef SSIZE_T ssize_t;
1482 #endif
1483  int nprocs = comm_->getSize();
1484  ssize_t reducevals[4];
1485  ssize_t sumHaveGlobal=0, sumHaveLocal=0;
1486  ssize_t sumGlobal=0, sumLocal=0;
1487  ssize_t maxGlobal=0, maxLocal=0;
1488  ssize_t vals[4] = {haveNumGlobalParts, haveNumLocalParts,
1489  numGlobalParts, numLocalParts};
1490 
1491  partDist_.clear();
1492  procDist_.clear();
1493 
1494  if (doCheck){
1495 
1496  try{
1497  reduceAll<int, ssize_t>(*comm_, Teuchos::REDUCE_SUM, 4, vals, reducevals);
1498  }
1499  Z2_THROW_OUTSIDE_ERROR(*env_);
1500 
1501  sumHaveGlobal = reducevals[0];
1502  sumHaveLocal = reducevals[1];
1503  sumGlobal = reducevals[2];
1504  sumLocal = reducevals[3];
1505 
1506  env_->localInputAssertion(__FILE__, __LINE__,
1507  "Either all procs specify num_global/local_parts or none do",
1508  (sumHaveGlobal == 0 || sumHaveGlobal == nprocs) &&
1509  (sumHaveLocal == 0 || sumHaveLocal == nprocs),
1510  BASIC_ASSERTION);
1511  }
1512  else{
1513  if (haveNumLocalParts)
1514  sumLocal = numLocalParts * nprocs;
1515  if (haveNumGlobalParts)
1516  sumGlobal = numGlobalParts * nprocs;
1517 
1518  sumHaveGlobal = haveNumGlobalParts ? nprocs : 0;
1519  sumHaveLocal = haveNumLocalParts ? nprocs : 0;
1520 
1521  maxLocal = numLocalParts;
1522  maxGlobal = numGlobalParts;
1523  }
1524 
1525  if (!haveNumLocalParts && !haveNumGlobalParts){
1526  onePartPerProc_ = true; // default if user did not specify
1527  return;
1528  }
1529 
1530  if (haveNumGlobalParts){
1531  if (doCheck){
1532  vals[0] = numGlobalParts;
1533  vals[1] = numLocalParts;
1534  try{
1535  reduceAll<int, ssize_t>(
1536  *comm_, Teuchos::REDUCE_MAX, 2, vals, reducevals);
1537  }
1538  Z2_THROW_OUTSIDE_ERROR(*env_);
1539 
1540  maxGlobal = reducevals[0];
1541  maxLocal = reducevals[1];
1542 
1543  env_->localInputAssertion(__FILE__, __LINE__,
1544  "Value for num_global_parts is different on different processes.",
1545  maxGlobal * nprocs == sumGlobal, BASIC_ASSERTION);
1546  }
1547 
1548  if (sumLocal){
1549  env_->localInputAssertion(__FILE__, __LINE__,
1550  "Sum of num_local_parts does not equal requested num_global_parts",
1551  sumLocal == numGlobalParts, BASIC_ASSERTION);
1552 
1553  if (sumLocal == nprocs && maxLocal == 1){
1554  onePartPerProc_ = true; // user specified one part per proc
1555  return;
1556  }
1557  }
1558  else{
1559  if (maxGlobal == nprocs){
1560  onePartPerProc_ = true; // user specified num parts is num procs
1561  return;
1562  }
1563  }
1564  }
1565 
1566  // If we are here, we do not have #parts == #procs.
1567 
1568  if (sumHaveLocal == nprocs){
1569  //
1570  // We will go by the number of local parts specified.
1571  //
1572 
1573  try{
1574  procDist_.resize(nprocs+1);
1575  }
1576  catch (std::exception &e){
1577  throw(std::bad_alloc());
1578  }
1579 
1580  part_t *procArray = &procDist_[0];
1581 
1582  try{
1583  part_t tmp = part_t(numLocalParts);
1584  gatherAll<int, part_t>(*comm_, 1, &tmp, nprocs, procArray + 1);
1585  }
1586  Z2_THROW_OUTSIDE_ERROR(*env_);
1587 
1588  procArray[0] = 0;
1589 
1590  for (int proc=0; proc < nprocs; proc++)
1591  procArray[proc+1] += procArray[proc];
1592  }
1593  else{
1594  //
1595  // We will allocate global number of parts to the processes.
1596  //
1597  double fParts = numGlobalParts;
1598  double fProcs = nprocs;
1599 
1600  if (fParts < fProcs){
1601 
1602  try{
1603  partDist_.resize(size_t(fParts+1));
1604  }
1605  catch (std::exception &e){
1606  throw(std::bad_alloc());
1607  }
1608 
1609  int *partArray = &partDist_[0];
1610 
1611  double each = floor(fProcs / fParts);
1612  double extra = fmod(fProcs, fParts);
1613  partDist_[0] = 0;
1614 
1615  for (part_t part=0; part < numGlobalParts; part++){
1616  int numOwners = int(each + ((part<extra) ? 1 : 0));
1617  partArray[part+1] = partArray[part] + numOwners;
1618  }
1619 
1620  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1621  partDist_[numGlobalParts] == nprocs, COMPLEX_ASSERTION, comm_);
1622  }
1623  else if (fParts > fProcs){
1624 
1625  // User did not specify local number of parts per proc;
1626  // Distribute the parts evenly among the procs.
1627 
1628  procDistEquallySpread_ = true;
1629 
1630  try{
1631  procDist_.resize(size_t(fProcs+1));
1632  }
1633  catch (std::exception &e){
1634  throw(std::bad_alloc());
1635  }
1636 
1637  part_t *procArray = &procDist_[0];
1638 
1639  double each = floor(fParts / fProcs);
1640  double extra = fmod(fParts, fProcs);
1641  procArray[0] = 0;
1642 
1643  for (int proc=0; proc < nprocs; proc++){
1644  part_t numParts = part_t(each + ((proc<extra) ? 1 : 0));
1645  procArray[proc+1] = procArray[proc] + numParts;
1646  }
1647 
1648  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1649  procDist_[nprocs] == numGlobalParts, COMPLEX_ASSERTION, comm_);
1650  }
1651  else{
1652  env_->globalBugAssertion(__FILE__, __LINE__,
1653  "should never get here", 1, COMPLEX_ASSERTION, comm_);
1654  }
1655  }
1656 }
1657 
1659 // Remap a new part assignment vector for maximum overlap with an input
1660 // part assignment.
1661 //
1662 // Assumptions for this version:
1663 // input part assignment == processor rank for every local object.
1664 // assuming nGlobalParts_ <= num ranks
1665 // TODO: Write a version that takes the input part number as input;
1666 // this change requires input parts in adapters to be provided in
1667 // the Adapter.
1668 // TODO: For repartitioning, compare to old remapping results; see Zoltan1.
1669 
1670 template <typename Adapter>
1672 {
1673  size_t len = parts_.size();
1674 
1675  part_t me = comm_->getRank();
1676  int np = comm_->getSize();
1677 
1678  if (np < nGlobalParts_) {
1679  if (me == 0)
1680  std::cout << "Remapping not yet supported for "
1681  << "num_global_parts " << nGlobalParts_
1682  << " > num procs " << np << std::endl;
1683  return;
1684  }
1685  // Build edges of a bipartite graph with np + nGlobalParts_ vertices,
1686  // and edges between a process vtx and any parts to which that process'
1687  // objects are assigned.
1688  // Weight edge[parts_[i]] by the number of objects that are going from
1689  // this rank to parts_[i].
1690  // We use a std::map, assuming the number of unique parts in the parts_ array
1691  // is small to keep the binary search efficient.
1692  // TODO We use the count of objects to move; should change to SIZE of objects
1693  // to move; need SIZE function in Adapter.
1694 
1695  std::map<part_t, long> edges;
1696  long lstaying = 0; // Total num of local objects staying if we keep the
1697  // current mapping. TODO: change to SIZE of local objs
1698  long gstaying = 0; // Total num of objects staying in the current partition
1699 
1700  for (size_t i = 0; i < len; i++) {
1701  edges[parts_[i]]++; // TODO Use obj size instead of count
1702  if (parts_[i] == me) lstaying++; // TODO Use obj size instead of count
1703  }
1704 
1705  Teuchos::reduceAll<int, long>(*comm_, Teuchos::REDUCE_SUM, 1,
1706  &lstaying, &gstaying);
1707 //TODO if (gstaying == Adapter::getGlobalNumObjs()) return; // Nothing to do
1708 
1709  part_t *remap = NULL;
1710 
1711  int nedges = edges.size();
1712 
1713  // Gather the graph to rank 0.
1714  part_t tnVtx = np + nGlobalParts_; // total # vertices
1715  int *idx = NULL; // Pointer index into graph adjacencies
1716  int *sizes = NULL; // nedges per rank
1717  if (me == 0) {
1718  idx = new int[tnVtx+1];
1719  sizes = new int[np];
1720  sizes[0] = nedges;
1721  }
1722  if (np > 1)
1723  Teuchos::gather<int, int>(&nedges, 1, sizes, 1, 0, *comm_);
1724 
1725  // prefix sum to build the idx array
1726  if (me == 0) {
1727  idx[0] = 0;
1728  for (int i = 0; i < np; i++)
1729  idx[i+1] = idx[i] + sizes[i];
1730  }
1731 
1732  // prepare to send edges
1733  int cnt = 0;
1734  part_t *bufv = NULL;
1735  long *bufw = NULL;
1736  if (nedges) {
1737  bufv = new part_t[nedges];
1738  bufw = new long[nedges];
1739  // Create buffer with edges (me, part[i]) and weight edges[parts_[i]].
1740  for (typename std::map<part_t, long>::iterator it = edges.begin();
1741  it != edges.end(); it++) {
1742  bufv[cnt] = it->first; // target part
1743  bufw[cnt] = it->second; // weight
1744  cnt++;
1745  }
1746  }
1747 
1748  // Prepare to receive edges on rank 0
1749  part_t *adj = NULL;
1750  long *wgt = NULL;
1751  if (me == 0) {
1752 //SYM adj = new part_t[2*idx[np]]; // need 2x space to symmetrize later
1753 //SYM wgt = new long[2*idx[np]]; // need 2x space to symmetrize later
1754  adj = new part_t[idx[np]];
1755  wgt = new long[idx[np]];
1756  }
1757 
1758  Teuchos::gatherv<int, part_t>(bufv, cnt, adj, sizes, idx, 0, *comm_);
1759  Teuchos::gatherv<int, long>(bufw, cnt, wgt, sizes, idx, 0, *comm_);
1760  delete [] bufv;
1761  delete [] bufw;
1762 
1763  // Now have constructed graph on rank 0.
1764  // Call the matching algorithm
1765 
1766  int doRemap;
1767  if (me == 0) {
1768  // We have the "LHS" vertices of the bipartite graph; need to create
1769  // "RHS" vertices.
1770  for (int i = 0; i < idx[np]; i++) {
1771  adj[i] += np; // New RHS vertex number; offset by num LHS vertices
1772  }
1773 
1774  // Build idx for RHS vertices
1775  for (part_t i = np; i < tnVtx; i++) {
1776  idx[i+1] = idx[i]; // No edges for RHS vertices
1777  }
1778 
1779 #ifdef KDDKDD_DEBUG
1780  cout << "IDX ";
1781  for (part_t i = 0; i <= tnVtx; i++) std::cout << idx[i] << " ";
1782  std::cout << std::endl;
1783 
1784  std::cout << "ADJ ";
1785  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << adj[i] << " ";
1786  std::cout << std::endl;
1787 
1788  std::cout << "WGT ";
1789  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << wgt[i] << " ";
1790  std::cout << std::endl;
1791 #endif
1792 
1793  // Perform matching on the graph
1794  part_t *match = new part_t[tnVtx];
1795  for (part_t i = 0; i < tnVtx; i++) match[i] = i;
1796  part_t nmatches =
1797  Zoltan2::GreedyMWM<part_t, long>(idx, adj, wgt, tnVtx, match);
1798 
1799 #ifdef KDDKDD_DEBUG
1800  std::cout << "After matching: " << nmatches << " found" << std::endl;
1801  for (part_t i = 0; i < tnVtx; i++)
1802  std::cout << "match[" << i << "] = " << match[i]
1803  << ((match[i] != i &&
1804  (i < np && match[i] != i+np))
1805  ? " *" : " ")
1806  << std::endl;
1807 #endif
1808 
1809  // See whether there were nontrivial changes in the matching.
1810  bool nontrivial = false;
1811  if (nmatches) {
1812  for (part_t i = 0; i < np; i++) {
1813  if ((match[i] != i) && (match[i] != (i+np))) {
1814  nontrivial = true;
1815  break;
1816  }
1817  }
1818  }
1819 
1820  // Process the matches
1821  if (nontrivial) {
1822  remap = new part_t[nGlobalParts_];
1823  for (part_t i = 0; i < nGlobalParts_; i++) remap[i] = -1;
1824 
1825  bool *used = new bool[np];
1826  for (part_t i = 0; i < np; i++) used[i] = false;
1827 
1828  // First, process all matched parts
1829  for (part_t i = 0; i < nGlobalParts_; i++) {
1830  part_t tmp = i + np;
1831  if (match[tmp] != tmp) {
1832  remap[i] = match[tmp];
1833  used[match[tmp]] = true;
1834  }
1835  }
1836 
1837  // Second, process unmatched parts; keep same part number if possible
1838  for (part_t i = 0; i < nGlobalParts_; i++) {
1839  if (remap[i] > -1) continue;
1840  if (!used[i]) {
1841  remap[i] = i;
1842  used[i] = true;
1843  }
1844  }
1845 
1846  // Third, process unmatched parts; give them the next unused part
1847  for (part_t i = 0, uidx = 0; i < nGlobalParts_; i++) {
1848  if (remap[i] > -1) continue;
1849  while (used[uidx]) uidx++;
1850  remap[i] = uidx;
1851  used[uidx] = true;
1852  }
1853  delete [] used;
1854  }
1855  delete [] match;
1856 
1857 #ifdef KDDKDD_DEBUG
1858  cout << "Remap vector: ";
1859  for (part_t i = 0; i < nGlobalParts_; i++) cout << remap[i] << " ";
1860  std::cout << std::endl;
1861 #endif
1862 
1863  long newgstaying = measure_stays(remap, idx, adj, wgt,
1864  nGlobalParts_, np);
1865  doRemap = (newgstaying > gstaying);
1866  std::cout << "gstaying " << gstaying << " measure(input) "
1867  << measure_stays(NULL, idx, adj, wgt, nGlobalParts_, np)
1868  << " newgstaying " << newgstaying
1869  << " nontrivial " << nontrivial
1870  << " doRemap " << doRemap << std::endl;
1871  }
1872  delete [] idx;
1873  delete [] sizes;
1874  delete [] adj;
1875  delete [] wgt;
1876 
1877  Teuchos::broadcast<int, int>(*comm_, 0, 1, &doRemap);
1878 
1879  if (doRemap) {
1880  if (me != 0) remap = new part_t[nGlobalParts_];
1881  Teuchos::broadcast<int, part_t>(*comm_, 0, nGlobalParts_, remap);
1882  for (size_t i = 0; i < len; i++) {
1883  parts_[i] = remap[parts_[i]];
1884  }
1885  }
1886 
1887  delete [] remap; // TODO May want to keep for repartitioning as in Zoltan
1888 }
1889 
1890 
1891 } // namespace Zoltan2
1892 
1893 #endif
void getPartsForProc(int procId, double &numParts, part_t &partMin, part_t &partMax) const
Get the parts belonging to a process.
Defines the Solution base class.
fast typical checks for valid arguments
bool criteriaHaveSamePartSizes(int c1, int c2) const
Return true if the two weight indices have the same part size information.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
const part_t * getProcDistribution() const
Return a distribution by process.
void RemapParts()
Remap a new partition for maximum overlap with an input partition.
Just a placeholder for now.
more involved, like validate a graph
int getNumberOfCriteria() const
Get the number of criteria (object weights)
sub-steps, each method&#39;s entry and exit
const RCP< const Comm< int > > & getCommunicator() const
Return the communicator associated with the solution.
const RCP< const Environment > & getEnvironment() const
Return the environment associated with the solution.
long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt, part_t nrhs, part_t nlhs)
void boxAssign(int dim, scalar_t *lower, scalar_t *upper, size_t &nPartsFound, part_t **partsFound) const
Return an array of all parts overlapping a given box in space.
dictionary vals
Definition: xml2dox.py:186
A PartitioningSolution is a solution to a partitioning problem.
scalar_t getCriteriaPartSize(int idx, part_t part) const
Get the size for a given weight index and a given part.
PartitioningSolution(const RCP< const Environment > &env, const RCP< const Comm< int > > &comm, int nUserWeights, const RCP< Algorithm< Adapter > > &algorithm=Teuchos::null)
Constructor when part sizes are not supplied.
const int * getProcListView() const
Returns the process list corresponding to the global ID list.
Algorithm defines the base class for all algorithms.
void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
Get the processes containing a part.
Greedy Maximal Weight Matching.
static const std::string fail
void setParts(ArrayRCP< part_t > &partList)
The algorithm uses setParts to set the solution.
part_t pointAssign(int dim, scalar_t *point) const
Return the part overlapping a given point in space;.
const int * getPartDistribution() const
Return a distribution by part.
size_t getActualGlobalNumberOfParts() const
Returns the actual global number of parts provided in setParts().
const part_t * getPartListView() const
Returns the part list corresponding to the global ID list.
size_t getLocalNumberOfParts() const
Returns the number of parts to be assigned to this process.
std::vector< Zoltan2::coordinateModelPartBox< scalar_t, part_t > > & getPartBoxesView() const
returns the part box boundary list.
Defines the Environment class.
bool oneToOnePartDistribution() const
Is the part-to-process distribution is one-to-one.
bool criteriaHasUniformPartSizes(int idx) const
Determine if balancing criteria has uniform part sizes. (User can specify differing part sizes...
size_t getTargetGlobalNumberOfParts() const
Returns the global number of parts desired in the solution.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
void getCommunicationGraph(ArrayRCP< part_t > &comXAdj, ArrayRCP< part_t > &comAdj) const
returns communication graph resulting from geometric partitioning.
#define epsilon
scalar_t getLocalFractionOfPart() const
If parts are divided across processes, return the fraction of a part on this process.