LAPACK  3.9.0
LAPACK: Linear Algebra PACKage

◆ chetrd_hb2st()

subroutine chetrd_hb2st ( character  STAGE1,
character  VECT,
character  UPLO,
integer  N,
integer  KD,
complex, dimension( ldab, * )  AB,
integer  LDAB,
real, dimension( * )  D,
real, dimension( * )  E,
complex, dimension( * )  HOUS,
integer  LHOUS,
complex, dimension( * )  WORK,
integer  LWORK,
integer  INFO 
)

CHBTRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric tridiagonal form T

Download CHBTRD_HB2ST + dependencies [TGZ] [ZIP] [TXT]

Purpose:
 CHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric
 tridiagonal form T by a unitary similarity transformation:
 Q**H * A * Q = T.
Parameters
[in]STAGE1
          STAGE1 is CHARACTER*1
          = 'N':  "No": to mention that the stage 1 of the reduction  
                  from dense to band using the chetrd_he2hb routine
                  was not called before this routine to reproduce AB. 
                  In other term this routine is called as standalone. 
          = 'Y':  "Yes": to mention that the stage 1 of the 
                  reduction from dense to band using the chetrd_he2hb 
                  routine has been called to produce AB (e.g., AB is
                  the output of chetrd_he2hb.
[in]VECT
          VECT is CHARACTER*1
          = 'N':  No need for the Housholder representation, 
                  and thus LHOUS is of size max(1, 4*N);
          = 'V':  the Householder representation is needed to 
                  either generate or to apply Q later on, 
                  then LHOUS is to be queried and computed.
                  (NOT AVAILABLE IN THIS RELEASE).
[in]UPLO
          UPLO is CHARACTER*1
          = 'U':  Upper triangle of A is stored;
          = 'L':  Lower triangle of A is stored.
[in]N
          N is INTEGER
          The order of the matrix A.  N >= 0.
[in]KD
          KD is INTEGER
          The number of superdiagonals of the matrix A if UPLO = 'U',
          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
[in,out]AB
          AB is COMPLEX array, dimension (LDAB,N)
          On entry, the upper or lower triangle of the Hermitian band
          matrix A, stored in the first KD+1 rows of the array.  The
          j-th column of A is stored in the j-th column of the array AB
          as follows:
          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
          On exit, the diagonal elements of AB are overwritten by the
          diagonal elements of the tridiagonal matrix T; if KD > 0, the
          elements on the first superdiagonal (if UPLO = 'U') or the
          first subdiagonal (if UPLO = 'L') are overwritten by the
          off-diagonal elements of T; the rest of AB is overwritten by
          values generated during the reduction.
[in]LDAB
          LDAB is INTEGER
          The leading dimension of the array AB.  LDAB >= KD+1.
[out]D
          D is REAL array, dimension (N)
          The diagonal elements of the tridiagonal matrix T.
[out]E
          E is REAL array, dimension (N-1)
          The off-diagonal elements of the tridiagonal matrix T:
          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
[out]HOUS
          HOUS is COMPLEX array, dimension LHOUS, that
          store the Householder representation.
[in]LHOUS
          LHOUS is INTEGER
          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
          If LWORK = -1, or LHOUS=-1,
          then a query is assumed; the routine
          only calculates the optimal size of the HOUS array, returns
          this value as the first entry of the HOUS array, and no error
          message related to LHOUS is issued by XERBLA.
          LHOUS = MAX(1, dimension) where
          dimension = 4*N if VECT='N'
          not available now if VECT='H'     
[out]WORK
          WORK is COMPLEX array, dimension LWORK.
[in]LWORK
          LWORK is INTEGER
          The dimension of the array WORK. LWORK = MAX(1, dimension)
          If LWORK = -1, or LHOUS=-1,
          then a workspace query is assumed; the routine
          only calculates the optimal size of the WORK array, returns
          this value as the first entry of the WORK array, and no error
          message related to LWORK is issued by XERBLA.
          LWORK = MAX(1, dimension) where
          dimension   = (2KD+1)*N + KD*NTHREADS
          where KD is the blocking size of the reduction,
          FACTOPTNB is the blocking used by the QR or LQ
          algorithm, usually FACTOPTNB=128 is a good choice
          NTHREADS is the number of threads used when
          openMP compilation is enabled, otherwise =1.
[out]INFO
          INFO is INTEGER
          = 0:  successful exit
          < 0:  if INFO = -i, the i-th argument had an illegal value
Author
Univ. of Tennessee
Univ. of California Berkeley
Univ. of Colorado Denver
NAG Ltd.
Date
November 2017
Further Details:
  Implemented by Azzam Haidar.

  All details are available on technical report, SC11, SC13 papers.

  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
  Parallel reduction to condensed forms for symmetric eigenvalue problems
  using aggregated fine-grained and memory-aware kernels. In Proceedings
  of 2011 International Conference for High Performance Computing,
  Networking, Storage and Analysis (SC '11), New York, NY, USA,
  Article 8 , 11 pages.
  http://doi.acm.org/10.1145/2063384.2063394

  A. Haidar, J. Kurzak, P. Luszczek, 2013.
  An improved parallel singular value algorithm and its implementation 
  for multicore hardware, In Proceedings of 2013 International Conference
  for High Performance Computing, Networking, Storage and Analysis (SC '13).
  Denver, Colorado, USA, 2013.
  Article 90, 12 pages.
  http://doi.acm.org/10.1145/2503210.2503292

  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
  calculations based on fine-grained memory aware tasks.
  International Journal of High Performance Computing Applications.
  Volume 28 Issue 2, Pages 196-209, May 2014.
  http://hpc.sagepub.com/content/28/2/196 

Definition at line 232 of file chetrd_hb2st.F.

232 *
233 *
234 #if defined(_OPENMP)
235  use omp_lib
236 #endif
237 *
238  IMPLICIT NONE
239 *
240 * -- LAPACK computational routine (version 3.8.0) --
241 * -- LAPACK is a software package provided by Univ. of Tennessee, --
242 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
243 * November 2017
244 *
245 * .. Scalar Arguments ..
246  CHARACTER STAGE1, UPLO, VECT
247  INTEGER N, KD, LDAB, LHOUS, LWORK, INFO
248 * ..
249 * .. Array Arguments ..
250  REAL D( * ), E( * )
251  COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * )
252 * ..
253 *
254 * =====================================================================
255 *
256 * .. Parameters ..
257  REAL RZERO
258  COMPLEX ZERO, ONE
259  parameter( rzero = 0.0e+0,
260  $ zero = ( 0.0e+0, 0.0e+0 ),
261  $ one = ( 1.0e+0, 0.0e+0 ) )
262 * ..
263 * .. Local Scalars ..
264  LOGICAL LQUERY, WANTQ, UPPER, AFTERS1
265  INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
266  $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
267  $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
268  $ NBTILES, TTYPE, TID, NTHREADS, DEBUG,
269  $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
270  $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
271  $ SICEV, SIZETAU, LDV, LHMIN, LWMIN
272  REAL ABSTMP
273  COMPLEX TMP
274 * ..
275 * .. External Subroutines ..
276  EXTERNAL chb2st_kernels, clacpy, claset, xerbla
277 * ..
278 * .. Intrinsic Functions ..
279  INTRINSIC min, max, ceiling, real
280 * ..
281 * .. External Functions ..
282  LOGICAL LSAME
283  INTEGER ILAENV2STAGE
284  EXTERNAL lsame, ilaenv2stage
285 * ..
286 * .. Executable Statements ..
287 *
288 * Determine the minimal workspace size required.
289 * Test the input parameters
290 *
291  debug = 0
292  info = 0
293  afters1 = lsame( stage1, 'Y' )
294  wantq = lsame( vect, 'V' )
295  upper = lsame( uplo, 'U' )
296  lquery = ( lwork.EQ.-1 ) .OR. ( lhous.EQ.-1 )
297 *
298 * Determine the block size, the workspace size and the hous size.
299 *
300  ib = ilaenv2stage( 2, 'CHETRD_HB2ST', vect, n, kd, -1, -1 )
301  lhmin = ilaenv2stage( 3, 'CHETRD_HB2ST', vect, n, kd, ib, -1 )
302  lwmin = ilaenv2stage( 4, 'CHETRD_HB2ST', vect, n, kd, ib, -1 )
303 *
304  IF( .NOT.afters1 .AND. .NOT.lsame( stage1, 'N' ) ) THEN
305  info = -1
306  ELSE IF( .NOT.lsame( vect, 'N' ) ) THEN
307  info = -2
308  ELSE IF( .NOT.upper .AND. .NOT.lsame( uplo, 'L' ) ) THEN
309  info = -3
310  ELSE IF( n.LT.0 ) THEN
311  info = -4
312  ELSE IF( kd.LT.0 ) THEN
313  info = -5
314  ELSE IF( ldab.LT.(kd+1) ) THEN
315  info = -7
316  ELSE IF( lhous.LT.lhmin .AND. .NOT.lquery ) THEN
317  info = -11
318  ELSE IF( lwork.LT.lwmin .AND. .NOT.lquery ) THEN
319  info = -13
320  END IF
321 *
322  IF( info.EQ.0 ) THEN
323  hous( 1 ) = lhmin
324  work( 1 ) = lwmin
325  END IF
326 *
327  IF( info.NE.0 ) THEN
328  CALL xerbla( 'CHETRD_HB2ST', -info )
329  RETURN
330  ELSE IF( lquery ) THEN
331  RETURN
332  END IF
333 *
334 * Quick return if possible
335 *
336  IF( n.EQ.0 ) THEN
337  hous( 1 ) = 1
338  work( 1 ) = 1
339  RETURN
340  END IF
341 *
342 * Determine pointer position
343 *
344  ldv = kd + ib
345  sizetau = 2 * n
346  sicev = 2 * n
347  indtau = 1
348  indv = indtau + sizetau
349  lda = 2 * kd + 1
350  sizea = lda * n
351  inda = 1
352  indw = inda + sizea
353  nthreads = 1
354  tid = 0
355 *
356  IF( upper ) THEN
357  apos = inda + kd
358  awpos = inda
359  dpos = apos + kd
360  ofdpos = dpos - 1
361  abdpos = kd + 1
362  abofdpos = kd
363  ELSE
364  apos = inda
365  awpos = inda + kd + 1
366  dpos = apos
367  ofdpos = dpos + 1
368  abdpos = 1
369  abofdpos = 2
370 
371  ENDIF
372 *
373 * Case KD=0:
374 * The matrix is diagonal. We just copy it (convert to "real" for
375 * complex because D is double and the imaginary part should be 0)
376 * and store it in D. A sequential code here is better or
377 * in a parallel environment it might need two cores for D and E
378 *
379  IF( kd.EQ.0 ) THEN
380  DO 30 i = 1, n
381  d( i ) = real( ab( abdpos, i ) )
382  30 CONTINUE
383  DO 40 i = 1, n-1
384  e( i ) = rzero
385  40 CONTINUE
386 *
387  hous( 1 ) = 1
388  work( 1 ) = 1
389  RETURN
390  END IF
391 *
392 * Case KD=1:
393 * The matrix is already Tridiagonal. We have to make diagonal
394 * and offdiagonal elements real, and store them in D and E.
395 * For that, for real precision just copy the diag and offdiag
396 * to D and E while for the COMPLEX case the bulge chasing is
397 * performed to convert the hermetian tridiagonal to symmetric
398 * tridiagonal. A simpler coversion formula might be used, but then
399 * updating the Q matrix will be required and based if Q is generated
400 * or not this might complicate the story.
401 *
402  IF( kd.EQ.1 ) THEN
403  DO 50 i = 1, n
404  d( i ) = real( ab( abdpos, i ) )
405  50 CONTINUE
406 *
407 * make off-diagonal elements real and copy them to E
408 *
409  IF( upper ) THEN
410  DO 60 i = 1, n - 1
411  tmp = ab( abofdpos, i+1 )
412  abstmp = abs( tmp )
413  ab( abofdpos, i+1 ) = abstmp
414  e( i ) = abstmp
415  IF( abstmp.NE.rzero ) THEN
416  tmp = tmp / abstmp
417  ELSE
418  tmp = one
419  END IF
420  IF( i.LT.n-1 )
421  $ ab( abofdpos, i+2 ) = ab( abofdpos, i+2 )*tmp
422 C IF( WANTZ ) THEN
423 C CALL CSCAL( N, CONJG( TMP ), Q( 1, I+1 ), 1 )
424 C END IF
425  60 CONTINUE
426  ELSE
427  DO 70 i = 1, n - 1
428  tmp = ab( abofdpos, i )
429  abstmp = abs( tmp )
430  ab( abofdpos, i ) = abstmp
431  e( i ) = abstmp
432  IF( abstmp.NE.rzero ) THEN
433  tmp = tmp / abstmp
434  ELSE
435  tmp = one
436  END IF
437  IF( i.LT.n-1 )
438  $ ab( abofdpos, i+1 ) = ab( abofdpos, i+1 )*tmp
439 C IF( WANTQ ) THEN
440 C CALL CSCAL( N, TMP, Q( 1, I+1 ), 1 )
441 C END IF
442  70 CONTINUE
443  ENDIF
444 *
445  hous( 1 ) = 1
446  work( 1 ) = 1
447  RETURN
448  END IF
449 *
450 * Main code start here.
451 * Reduce the hermitian band of A to a tridiagonal matrix.
452 *
453  thgrsiz = n
454  grsiz = 1
455  shift = 3
456  nbtiles = ceiling( real(n)/real(kd) )
457  stepercol = ceiling( real(shift)/real(grsiz) )
458  thgrnb = ceiling( real(n-1)/real(thgrsiz) )
459 *
460  CALL clacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
461  CALL claset( "A", kd, n, zero, zero, work( awpos ), lda )
462 *
463 *
464 * openMP parallelisation start here
465 *
466 #if defined(_OPENMP)
467 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
468 !$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
469 !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
470 !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
471 !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
472 !$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
473 !$OMP MASTER
474 #endif
475 *
476 * main bulge chasing loop
477 *
478  DO 100 thgrid = 1, thgrnb
479  stt = (thgrid-1)*thgrsiz+1
480  thed = min( (stt + thgrsiz -1), (n-1))
481  DO 110 i = stt, n-1
482  ed = min( i, thed )
483  IF( stt.GT.ed ) EXIT
484  DO 120 m = 1, stepercol
485  st = stt
486  DO 130 sweepid = st, ed
487  DO 140 k = 1, grsiz
488  myid = (i-sweepid)*(stepercol*grsiz)
489  $ + (m-1)*grsiz + k
490  IF ( myid.EQ.1 ) THEN
491  ttype = 1
492  ELSE
493  ttype = mod( myid, 2 ) + 2
494  ENDIF
495 
496  IF( ttype.EQ.2 ) THEN
497  colpt = (myid/2)*kd + sweepid
498  stind = colpt-kd+1
499  edind = min(colpt,n)
500  blklastind = colpt
501  ELSE
502  colpt = ((myid+1)/2)*kd + sweepid
503  stind = colpt-kd+1
504  edind = min(colpt,n)
505  IF( ( stind.GE.edind-1 ).AND.
506  $ ( edind.EQ.n ) ) THEN
507  blklastind = n
508  ELSE
509  blklastind = 0
510  ENDIF
511  ENDIF
512 *
513 * Call the kernel
514 *
515 #if defined(_OPENMP)
516  IF( ttype.NE.1 ) THEN
517 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
518 !$OMP$ DEPEND(in:WORK(MYID-1))
519 !$OMP$ DEPEND(out:WORK(MYID))
520  tid = omp_get_thread_num()
521  CALL chb2st_kernels( uplo, wantq, ttype,
522  $ stind, edind, sweepid, n, kd, ib,
523  $ work( inda ), lda,
524  $ hous( indv ), hous( indtau ), ldv,
525  $ work( indw + tid*kd ) )
526 !$OMP END TASK
527  ELSE
528 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
529 !$OMP$ DEPEND(out:WORK(MYID))
530  tid = omp_get_thread_num()
531  CALL chb2st_kernels( uplo, wantq, ttype,
532  $ stind, edind, sweepid, n, kd, ib,
533  $ work( inda ), lda,
534  $ hous( indv ), hous( indtau ), ldv,
535  $ work( indw + tid*kd ) )
536 !$OMP END TASK
537  ENDIF
538 #else
539  CALL chb2st_kernels( uplo, wantq, ttype,
540  $ stind, edind, sweepid, n, kd, ib,
541  $ work( inda ), lda,
542  $ hous( indv ), hous( indtau ), ldv,
543  $ work( indw + tid*kd ) )
544 #endif
545  IF ( blklastind.GE.(n-1) ) THEN
546  stt = stt + 1
547  EXIT
548  ENDIF
549  140 CONTINUE
550  130 CONTINUE
551  120 CONTINUE
552  110 CONTINUE
553  100 CONTINUE
554 *
555 #if defined(_OPENMP)
556 !$OMP END MASTER
557 !$OMP END PARALLEL
558 #endif
559 *
560 * Copy the diagonal from A to D. Note that D is REAL thus only
561 * the Real part is needed, the imaginary part should be zero.
562 *
563  DO 150 i = 1, n
564  d( i ) = real( work( dpos+(i-1)*lda ) )
565  150 CONTINUE
566 *
567 * Copy the off diagonal from A to E. Note that E is REAL thus only
568 * the Real part is needed, the imaginary part should be zero.
569 *
570  IF( upper ) THEN
571  DO 160 i = 1, n-1
572  e( i ) = real( work( ofdpos+i*lda ) )
573  160 CONTINUE
574  ELSE
575  DO 170 i = 1, n-1
576  e( i ) = real( work( ofdpos+(i-1)*lda ) )
577  170 CONTINUE
578  ENDIF
579 *
580  hous( 1 ) = lhmin
581  work( 1 ) = lwmin
582  RETURN
583 *
584 * End of CHETRD_HB2ST
585 *
Here is the call graph for this function:
Here is the caller graph for this function:
chb2st_kernels
subroutine chb2st_kernels(UPLO, WANTZ, TTYPE, ST, ED, SWEEP, N, NB, IB, A, LDA, V, TAU, LDVT, WORK)
CHB2ST_KERNELS
Definition: chb2st_kernels.f:170
clacpy
subroutine clacpy(UPLO, M, N, A, LDA, B, LDB)
CLACPY copies all or part of one two-dimensional array to another.
Definition: clacpy.f:105
xerbla
subroutine xerbla(SRNAME, INFO)
XERBLA
Definition: xerbla.f:62
lsame
logical function lsame(CA, CB)
LSAME
Definition: lsame.f:55
claset
subroutine claset(UPLO, M, N, ALPHA, BETA, A, LDA)
CLASET initializes the off-diagonal elements and the diagonal elements of a matrix to given values.
Definition: claset.f:108
ilaenv2stage
integer function ilaenv2stage(ISPEC, NAME, OPTS, N1, N2, N3, N4)
ILAENV2STAGE
Definition: ilaenv2stage.f:151