df/d5e/chetrd__hb2st_8_f_source.html

*> \brief \b CHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric tridiagonal form T

*

*  =========== DOCUMENTATION ===========

*

* Online html documentation available at

*            http://www.netlib.org/lapack/explore-html/

*

*> \htmlonly

*> Download CHETRD_HB2ST + dependencies

*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbtrd_hb2st.f">

*> [TGZ]</a>

*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbtrd_hb2st.f">

*> [ZIP]</a>

*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbtrd_hb2st.f">

*> [TXT]</a>

*> \endhtmlonly

*

*  Definition:

*  ===========

*

*       SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,

*                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )

*

*       #if defined(_OPENMP)

*       use omp_lib

*       #endif

*

*       IMPLICIT NONE

*

*       .. Scalar Arguments ..

*       CHARACTER          STAGE1, UPLO, VECT

*       INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO

*       ..

*       .. Array Arguments ..

*       REAL               D( * ), E( * )

*       COMPLEX            AB( LDAB, * ), HOUS( * ), WORK( * )

*       ..

*

*

*> \par Purpose:

*  =============

*>

*> \verbatim

*>

*> CHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric

*> tridiagonal form T by a unitary similarity transformation:

*> Q**H * A * Q = T.

*> \endverbatim

*

*  Arguments:

*  ==========

*

*> \param[in] STAGE1

*> \verbatim

*>          STAGE1 is CHARACTER*1

*>          = 'N':  "No": to mention that the stage 1 of the reduction

*>                  from dense to band using the chetrd_he2hb routine

*>                  was not called before this routine to produce AB.

*>                  In other words, this routine is called standalone.

*>          = 'Y':  "Yes": to mention that the stage 1 of the

*>                  reduction from dense to band using the chetrd_he2hb

*>                  routine has been called to produce AB (e.g., AB is

*>                  the output of chetrd_he2hb).

*> \endverbatim

*>

*> \param[in] VECT

*> \verbatim

*>          VECT is CHARACTER*1

*>          = 'N':  No need for the Householder representation,

*>                  and thus LHOUS is of size max(1, 4*N);

*>          = 'V':  the Householder representation is needed to

*>                  either generate or to apply Q later on,

*>                  then LHOUS is to be queried and computed.

*>                  (NOT AVAILABLE IN THIS RELEASE).

*> \endverbatim

*>

*> \param[in] UPLO

*> \verbatim

*>          UPLO is CHARACTER*1

*>          = 'U':  Upper triangle of A is stored;

*>          = 'L':  Lower triangle of A is stored.

*> \endverbatim

*>

*> \param[in] N

*> \verbatim

*>          N is INTEGER

*>          The order of the matrix A.  N >= 0.

*> \endverbatim

*>

*> \param[in] KD

*> \verbatim

*>          KD is INTEGER

*>          The number of superdiagonals of the matrix A if UPLO = 'U',

*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.

*> \endverbatim

*>

*> \param[in,out] AB

*> \verbatim

*>          AB is COMPLEX array, dimension (LDAB,N)

*>          On entry, the upper or lower triangle of the Hermitian band

*>          matrix A, stored in the first KD+1 rows of the array.  The

*>          j-th column of A is stored in the j-th column of the array AB

*>          as follows:

*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;

*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).

*>          On exit, the diagonal elements of AB are overwritten by the

*>          diagonal elements of the tridiagonal matrix T; if KD > 0, the

*>          elements on the first superdiagonal (if UPLO = 'U') or the

*>          first subdiagonal (if UPLO = 'L') are overwritten by the

*>          off-diagonal elements of T; the rest of AB is overwritten by

*>          values generated during the reduction.

*> \endverbatim

*>

*> \param[in] LDAB

*> \verbatim

*>          LDAB is INTEGER

*>          The leading dimension of the array AB.  LDAB >= KD+1.

*> \endverbatim

*>

*> \param[out] D

*> \verbatim

*>          D is REAL array, dimension (N)

*>          The diagonal elements of the tridiagonal matrix T.

*> \endverbatim

*>

*> \param[out] E

*> \verbatim

*>          E is REAL array, dimension (N-1)

*>          The off-diagonal elements of the tridiagonal matrix T:

*>          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.

*> \endverbatim

*>

*> \param[out] HOUS

*> \verbatim

*>          HOUS is COMPLEX array, dimension LHOUS, that

*>          store the Householder representation.

*> \endverbatim

*>

*> \param[in] LHOUS

*> \verbatim

*>          LHOUS is INTEGER

*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)

*>          If LWORK = -1, or LHOUS=-1,

*>          then a query is assumed; the routine

*>          only calculates the optimal size of the HOUS array, returns

*>          this value as the first entry of the HOUS array, and no error

*>          message related to LHOUS is issued by XERBLA.

*>          LHOUS = MAX(1, dimension) where

*>          dimension = 4*N if VECT='N'

*>          not available now if VECT='V'

*> \endverbatim

*>

*> \param[out] WORK

*> \verbatim

*>          WORK is COMPLEX array, dimension LWORK.

*> \endverbatim

*>

*> \param[in] LWORK

*> \verbatim

*>          LWORK is INTEGER

*>          The dimension of the array WORK. LWORK = MAX(1, dimension)

*>          If LWORK = -1, or LHOUS=-1,

*>          then a workspace query is assumed; the routine

*>          only calculates the optimal size of the WORK array, returns

*>          this value as the first entry of the WORK array, and no error

*>          message related to LWORK is issued by XERBLA.

*>          LWORK = MAX(1, dimension) where

*>          dimension   = (2KD+1)*N + KD*NTHREADS

*>          where KD is the blocking size of the reduction,

*>          FACTOPTNB is the blocking used by the QR or LQ

*>          algorithm, usually FACTOPTNB=128 is a good choice

*>          NTHREADS is the number of threads used when

*>          openMP compilation is enabled, otherwise =1.

*> \endverbatim

*>

*> \param[out] INFO

*> \verbatim

*>          INFO is INTEGER

*>          = 0:  successful exit

*>          < 0:  if INFO = -i, the i-th argument had an illegal value

*> \endverbatim

*

*  Authors:

*  ========

*

*> \author Univ. of Tennessee

*> \author Univ. of California Berkeley

*> \author Univ. of Colorado Denver

*> \author NAG Ltd.

*

*> \date November 2017

*

*> \ingroup complexOTHERcomputational

*

*> \par Further Details:

*  =====================

*>

*> \verbatim

*>

*>  Implemented by Azzam Haidar.

*>

*>  All details are available on technical report, SC11, SC13 papers.

*>

*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.

*>  Parallel reduction to condensed forms for symmetric eigenvalue problems

*>  using aggregated fine-grained and memory-aware kernels. In Proceedings

*>  of 2011 International Conference for High Performance Computing,

*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,

*>  Article 8 , 11 pages.

*>  http://doi.acm.org/10.1145/2063384.2063394

*>

*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.

*>  An improved parallel singular value algorithm and its implementation

*>  for multicore hardware, In Proceedings of 2013 International Conference

*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).

*>  Denver, Colorado, USA, 2013.

*>  Article 90, 12 pages.

*>  http://doi.acm.org/10.1145/2503210.2503292

*>

*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.

*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure

*>  calculations based on fine-grained memory aware tasks.

*>  International Journal of High Performance Computing Applications.

*>  Volume 28 Issue 2, Pages 196-209, May 2014.

*>  http://hpc.sagepub.com/content/28/2/196

*>

*> \endverbatim

*>

*  =====================================================================

      SUBROUTINE chetrd_hb2st( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
     $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
*
*     Reduce the complex Hermitian band matrix stored in AB to real
*     symmetric tridiagonal form; the diagonal is returned in D and
*     the off-diagonal in E.
*
*     Arguments, as validated and used by the code below:
*       STAGE1 (in)  'Y' or 'N'; any other value gives INFO = -1.
*                    The flag is only validated in this routine.
*       VECT   (in)  must be 'N'; any other value gives INFO = -2.
*       UPLO   (in)  'U' or 'L' (upper / lower band storage).
*       N      (in)  N >= 0.
*       KD     (in)  KD >= 0.
*       AB  (in/out) band storage with LDAB >= KD+1.  For KD = 1 the
*                    off-diagonal row is overwritten with real
*                    values; for KD = 0 and KD >= 2 AB is only read.
*       D      (out) N diagonal entries of T.
*       E      (out) N-1 off-diagonal entries of T.
*       HOUS, WORK   workspaces of length LHOUS and LWORK.  On a
*                    successful return HOUS(1) and WORK(1) hold a
*                    workspace size (see below).
*       LHOUS, LWORK (in) if either one is -1 the call is a workspace
*                    query: only HOUS(1) and WORK(1) are set.
*       INFO   (out) 0 on success, -i if argument i is illegal.
*
*     Workspace sizes: when N = 0 or KD <= 1 nothing beyond HOUS(1)
*     and WORK(1) is touched, so the required sizes are 1.  Otherwise
*     the sizes are obtained from ILAENV2STAGE (ISPEC = 3 and 4).
*
#if defined(_OPENMP)
      use omp_lib
#endif
*
      IMPLICIT NONE
*
*  -- LAPACK computational routine (version 3.8.0) --
*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
*     November 2017
*
*     .. Scalar Arguments ..
      CHARACTER          STAGE1, UPLO, VECT
      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
*     ..
*     .. Array Arguments ..
      REAL               D( * ), E( * )
      COMPLEX            AB( LDAB, * ), HOUS( * ), WORK( * )
*     ..
*
*  =====================================================================
*
*     .. Parameters ..
      REAL               RZERO
      COMPLEX            ZERO, ONE
      parameter( rzero = 0.0e+0,
     $                   zero = ( 0.0e+0, 0.0e+0 ),
     $                   one  = ( 1.0e+0, 0.0e+0 ) )
*     ..
*     .. Local Scalars ..
      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
     $                   ed, stind, edind, blklastind, colpt, thed,
     $                   stepercol, grsiz, thgrsiz, thgrnb, thgrid,
     $                   nbtiles, ttype, tid,
     $                   abdpos, abofdpos, dpos, ofdpos, awpos,
     $                   inda, indw, apos, sizea, lda, indv, indtau,
     $                   sizetau, ldv, lhmin, lwmin
      REAL               ABSTMP
      COMPLEX            TMP
*     ..
*     .. External Subroutines ..
      EXTERNAL           chb2st_kernels, clacpy, claset, xerbla
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          abs, ceiling, max, min, mod, real
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV2STAGE
      EXTERNAL           lsame, ilaenv2stage
*     ..
*     .. Executable Statements ..
*
*     Decode the option flags.  A value of -1 in either LWORK or
*     LHOUS turns the call into a workspace query.
*
      info    = 0
      afters1 = lsame( stage1, 'Y' )
      wantq   = lsame( vect, 'V' )
      upper   = lsame( uplo, 'U' )
      lquery  = ( lwork.EQ.-1 ) .OR. ( lhous.EQ.-1 )
*
*     Determine the block size, the workspace size and the hous size.
*     The N = 0 and KD <= 1 paths below use no workspace (they report
*     a size of 1 on exit), so do not demand the general sizes from
*     the caller in those cases.
*
      ib     = ilaenv2stage( 2, 'CHETRD_HB2ST', vect, n, kd, -1, -1 )
      IF( n.EQ.0 .OR. kd.LE.1 ) THEN
         lhmin = 1
         lwmin = 1
      ELSE
         lhmin = ilaenv2stage( 3, 'CHETRD_HB2ST', vect, n, kd,
     $                         ib, -1 )
         lwmin = ilaenv2stage( 4, 'CHETRD_HB2ST', vect, n, kd,
     $                         ib, -1 )
      END IF
*
*     Test the input parameters.  The INFO value is minus the
*     position of the offending argument in the argument list.
*
      IF( .NOT.afters1 .AND. .NOT.lsame( stage1, 'N' ) ) THEN
         info = -1
      ELSE IF( .NOT.lsame( vect, 'N' ) ) THEN
         info = -2
      ELSE IF( .NOT.upper .AND. .NOT.lsame( uplo, 'L' ) ) THEN
         info = -3
      ELSE IF( n.LT.0 ) THEN
         info = -4
      ELSE IF( kd.LT.0 ) THEN
         info = -5
      ELSE IF( ldab.LT.(kd+1) ) THEN
         info = -7
      ELSE IF( lhous.LT.lhmin .AND. .NOT.lquery ) THEN
         info = -11
      ELSE IF( lwork.LT.lwmin .AND. .NOT.lquery ) THEN
         info = -13
      END IF
*
*     Report the workspace sizes whenever the arguments are valid.
*
      IF( info.EQ.0 ) THEN
         hous( 1 ) = lhmin
         work( 1 ) = lwmin
      END IF
*
      IF( info.NE.0 ) THEN
         CALL xerbla( 'CHETRD_HB2ST', -info )
         RETURN
      ELSE IF( lquery ) THEN
         RETURN
      END IF
*
*     Quick return if possible
*
      IF( n.EQ.0 ) THEN
          hous( 1 ) = 1
          work( 1 ) = 1
          RETURN
      END IF
*
*     Determine pointer position
*
*     HOUS is split into a segment of length 2*N starting at INDTAU
*     followed by a segment starting at INDV; both are handed to
*     CHB2ST_KERNELS.  WORK holds an LDA-by-N array (LDA = 2*KD+1)
*     starting at INDA, followed by scratch space starting at INDW.
*
      ldv      = kd + ib
      sizetau  = 2 * n
      indtau   = 1
      indv     = indtau + sizetau
      lda      = 2 * kd + 1
      sizea    = lda * n
      inda     = 1
      indw     = inda + sizea
      tid      = 0
*
*     APOS  : where the KD+1 rows of AB are copied inside WORK
*     AWPOS : where the remaining KD rows of WORK are zeroed
*     DPOS / OFDPOS     : diagonal / off-diagonal position in the
*                         first column of the WORK copy
*     ABDPOS / ABOFDPOS : diagonal / off-diagonal row of AB
*
      IF( upper ) THEN
          apos     = inda + kd
          awpos    = inda
          dpos     = apos + kd
          ofdpos   = dpos - 1
          abdpos   = kd + 1
          abofdpos = kd
      ELSE
          apos     = inda
          awpos    = inda + kd + 1
          dpos     = apos
          ofdpos   = dpos + 1
          abdpos   = 1
          abofdpos = 2

      ENDIF
*
*     Case KD=0:
*     The matrix is diagonal. We just copy it (taking the real part,
*     because D is REAL and the imaginary part should be 0)
*     and store it in D. A sequential code here is better or
*     in a parallel environment it might need two cores for D and E
*
      IF( kd.EQ.0 ) THEN
          DO 30 i = 1, n
              d( i ) = real( ab( abdpos, i ) )
   30     CONTINUE
          DO 40 i = 1, n-1
              e( i ) = rzero
   40     CONTINUE
*
          hous( 1 ) = 1
          work( 1 ) = 1
          RETURN
      END IF
*
*     Case KD=1:
*     The matrix is already tridiagonal. We have to make diagonal
*     and offdiagonal elements real, and store them in D and E.
*     For that, for real precision just copy the diag and offdiag
*     to D and E while for the COMPLEX case the bulge chasing is
*     performed to convert the Hermitian tridiagonal to symmetric
*     tridiagonal. A simpler conversion formula might be used, but
*     then updating the Q matrix will be required and based if Q is
*     generated or not this might complicate the story.
*
      IF( kd.EQ.1 ) THEN
          DO 50 i = 1, n
              d( i ) = real( ab( abdpos, i ) )
   50     CONTINUE
*
*         make off-diagonal elements real and copy them to E
*
*         Each off-diagonal entry is replaced by its modulus; its
*         unit phase factor TMP (ONE when the entry is zero) is then
*         multiplied into the next off-diagonal entry.
*
          IF( upper ) THEN
              DO 60 i = 1, n - 1
                  tmp = ab( abofdpos, i+1 )
                  abstmp = abs( tmp )
                  ab( abofdpos, i+1 ) = abstmp
                  e( i ) = abstmp
                  IF( abstmp.NE.rzero ) THEN
                     tmp = tmp / abstmp
                  ELSE
                     tmp = one
                  END IF
                  IF( i.LT.n-1 )
     $               ab( abofdpos, i+2 ) = ab( abofdpos, i+2 )*tmp
C                  IF( WANTZ ) THEN
C                     CALL CSCAL( N, CONJG( TMP ), Q( 1, I+1 ), 1 )
C                  END IF
   60         CONTINUE
          ELSE
              DO 70 i = 1, n - 1
                 tmp = ab( abofdpos, i )
                 abstmp = abs( tmp )
                 ab( abofdpos, i ) = abstmp
                 e( i ) = abstmp
                 IF( abstmp.NE.rzero ) THEN
                    tmp = tmp / abstmp
                 ELSE
                    tmp = one
                 END IF
                 IF( i.LT.n-1 )
     $              ab( abofdpos, i+1 ) = ab( abofdpos, i+1 )*tmp
C                 IF( WANTQ ) THEN
C                    CALL CSCAL( N, TMP, Q( 1, I+1 ), 1 )
C                 END IF
   70         CONTINUE
          ENDIF
*
          hous( 1 ) = 1
          work( 1 ) = 1
          RETURN
      END IF
*
*     Main code start here.
*     Reduce the Hermitian band of A to a tridiagonal matrix.
*
*     Scheduling parameters of the loop nest below.  With these
*     values STEPERCOL = 3 and THGRNB is 1 for N > 1 (0 for N = 1).
*
      thgrsiz   = n
      grsiz     = 1
      shift     = 3
      nbtiles   = ceiling( real(n)/real(kd) )
      stepercol = ceiling( real(shift)/real(grsiz) )
      thgrnb    = ceiling( real(n-1)/real(thgrsiz) )
*
*     Copy the KD+1 stored rows of AB into the LDA-by-N array in
*     WORK and zero its other KD rows; AB itself is left untouched
*     from here on.
*
      CALL clacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
      CALL claset( "A", kd,   n, zero, zero, work( awpos ), lda )
*
*
*     openMP parallelisation start here
*
*     The loop nest is executed inside the MASTER construct; each
*     kernel call is issued as a task whose ordering is expressed
*     through the DEPEND clauses on elements of WORK indexed by MYID.
*
#if defined(_OPENMP)
!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
!$OMP MASTER
#endif
*
*     main bulge chasing loop
*
      DO 100 thgrid = 1, thgrnb
          stt  = (thgrid-1)*thgrsiz+1
          thed = min( (stt + thgrsiz -1), (n-1))
          DO 110 i = stt, n-1
              ed = min( i, thed )
              IF( stt.GT.ed ) EXIT
              DO 120 m = 1, stepercol
*
*                 ST freezes the current first sweep: STT may be
*                 advanced inside the loops below.
*
                  st = stt
                  DO 130 sweepid = st, ed
                      DO 140 k = 1, grsiz
                          myid  = (i-sweepid)*(stepercol*grsiz)
     $                           + (m-1)*grsiz + k
*
*                         Kernel type: 1 for the first task of a
*                         sweep (MYID = 1), otherwise 2 for even
*                         MYID and 3 for odd MYID.
*
                          IF ( myid.EQ.1 ) THEN
                              ttype = 1
                          ELSE
                              ttype = mod( myid, 2 ) + 2
                          ENDIF

*
*                         Index range [STIND, EDIND] handed to the
*                         kernel, and BLKLASTIND, which flags that
*                         the sweep has reached the end of the
*                         matrix.
*
                          IF( ttype.EQ.2 ) THEN
                              colpt      = (myid/2)*kd + sweepid
                              stind      = colpt-kd+1
                              edind      = min(colpt,n)
                              blklastind = colpt
                          ELSE
                              colpt      = ((myid+1)/2)*kd + sweepid
                              stind      = colpt-kd+1
                              edind      = min(colpt,n)
                              IF( ( stind.GE.edind-1 ).AND.
     $                            ( edind.EQ.n ) ) THEN
                                  blklastind = n
                              ELSE
                                  blklastind = 0
                              ENDIF
                          ENDIF
*
*                         Call the kernel
*
*                         Each call uses the KD-long scratch slice
*                         of WORK at INDW + TID*KD (TID is 0 in a
*                         build without OpenMP).
*
#if defined(_OPENMP)
                          IF( ttype.NE.1 ) THEN
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
!$OMP$     DEPEND(in:WORK(MYID-1))
!$OMP$     DEPEND(out:WORK(MYID))
                              tid      = omp_get_thread_num()
                              CALL chb2st_kernels( uplo, wantq, ttype,
     $                             stind, edind, sweepid, n, kd, ib,
     $                             work( inda ), lda,
     $                             hous( indv ), hous( indtau ), ldv,
     $                             work( indw + tid*kd ) )
!$OMP END TASK
                          ELSE
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
!$OMP$     DEPEND(out:WORK(MYID))
                              tid      = omp_get_thread_num()
                              CALL chb2st_kernels( uplo, wantq, ttype,
     $                             stind, edind, sweepid, n, kd, ib,
     $                             work( inda ), lda,
     $                             hous( indv ), hous( indtau ), ldv,
     $                             work( indw + tid*kd ) )
!$OMP END TASK
                          ENDIF
#else
                          CALL chb2st_kernels( uplo, wantq, ttype,
     $                         stind, edind, sweepid, n, kd, ib,
     $                         work( inda ), lda,
     $                         hous( indv ), hous( indtau ), ldv,
     $                         work( indw + tid*kd ) )
#endif
*
*                         The sweep has reached the end of the
*                         matrix: start later passes one sweep
*                         further on and leave the K loop.
*
                          IF ( blklastind.GE.(n-1) ) THEN
                              stt = stt + 1
                              EXIT
                          ENDIF
  140                 CONTINUE
  130             CONTINUE
  120         CONTINUE
  110     CONTINUE
  100 CONTINUE
*
#if defined(_OPENMP)
!$OMP END MASTER
!$OMP END PARALLEL
#endif
*
*     Copy the diagonal from A to D. Note that D is REAL thus only
*     the Real part is needed, the imaginary part should be zero.
*
      DO 150 i = 1, n
          d( i ) = real( work( dpos+(i-1)*lda ) )
  150 CONTINUE
*
*     Copy the off diagonal from A to E. Note that E is REAL thus only
*     the Real part is needed, the imaginary part should be zero.
*     In upper storage the entry for E(i) sits in column i+1 of the
*     WORK copy, in lower storage it sits in column i.
*
      IF( upper ) THEN
          DO 160 i = 1, n-1
             e( i ) = real( work( ofdpos+i*lda ) )
  160     CONTINUE
      ELSE
          DO 170 i = 1, n-1
             e( i ) = real( work( ofdpos+(i-1)*lda ) )
  170     CONTINUE
      ENDIF
*
*     Report the workspace sizes (this also overwrites the first
*     entry of the WORK copy, which is no longer needed).
*
      hous( 1 ) = lhmin
      work( 1 ) = lwmin
      RETURN
*
*     End of CHETRD_HB2ST
*
      END