LAPACK  3.9.0
LAPACK: Linear Algebra PACKage
chetrd_hb2st.F
Go to the documentation of this file.
1 *> \brief \b CHBTRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric tridiagonal form T
2 *
3 * =========== DOCUMENTATION ===========
4 *
5 * Online html documentation available at
6 * http://www.netlib.org/lapack/explore-html/
7 *
8 *> \htmlonly
9 *> Download CHBTRD_HB2ST + dependencies
10 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbtrd_hb2st.f">
11 *> [TGZ]</a>
12 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbtrd_hb2st.f">
13 *> [ZIP]</a>
14 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbtrd_hb2st.f">
15 *> [TXT]</a>
16 *> \endhtmlonly
17 *
18 * Definition:
19 * ===========
20 *
21 * SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
22 * D, E, HOUS, LHOUS, WORK, LWORK, INFO )
23 *
24 * #if defined(_OPENMP)
25 * use omp_lib
26 * #endif
27 *
28 * IMPLICIT NONE
29 *
30 * .. Scalar Arguments ..
31 * CHARACTER STAGE1, UPLO, VECT
32 * INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO
33 * ..
34 * .. Array Arguments ..
35 * REAL D( * ), E( * )
36 * COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * )
37 * ..
38 *
39 *
40 *> \par Purpose:
41 * =============
42 *>
43 *> \verbatim
44 *>
45 *> CHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric
46 *> tridiagonal form T by a unitary similarity transformation:
47 *> Q**H * A * Q = T.
48 *> \endverbatim
49 *
50 * Arguments:
51 * ==========
52 *
53 *> \param[in] STAGE1
54 *> \verbatim
55 *> STAGE1 is CHARACTER*1
56 *> = 'N': "No": to mention that the stage 1 of the reduction
57 *> from dense to band using the chetrd_he2hb routine
58 *> was not called before this routine to reproduce AB.
59 *> In other term this routine is called as standalone.
60 *> = 'Y': "Yes": to mention that the stage 1 of the
61 *> reduction from dense to band using the chetrd_he2hb
62 *> routine has been called to produce AB (e.g., AB is
63 *> the output of chetrd_he2hb.
64 *> \endverbatim
65 *>
66 *> \param[in] VECT
67 *> \verbatim
68 *> VECT is CHARACTER*1
69 *> = 'N': No need for the Housholder representation,
70 *> and thus LHOUS is of size max(1, 4*N);
71 *> = 'V': the Householder representation is needed to
72 *> either generate or to apply Q later on,
73 *> then LHOUS is to be queried and computed.
74 *> (NOT AVAILABLE IN THIS RELEASE).
75 *> \endverbatim
76 *>
77 *> \param[in] UPLO
78 *> \verbatim
79 *> UPLO is CHARACTER*1
80 *> = 'U': Upper triangle of A is stored;
81 *> = 'L': Lower triangle of A is stored.
82 *> \endverbatim
83 *>
84 *> \param[in] N
85 *> \verbatim
86 *> N is INTEGER
87 *> The order of the matrix A. N >= 0.
88 *> \endverbatim
89 *>
90 *> \param[in] KD
91 *> \verbatim
92 *> KD is INTEGER
93 *> The number of superdiagonals of the matrix A if UPLO = 'U',
94 *> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
95 *> \endverbatim
96 *>
97 *> \param[in,out] AB
98 *> \verbatim
99 *> AB is COMPLEX array, dimension (LDAB,N)
100 *> On entry, the upper or lower triangle of the Hermitian band
101 *> matrix A, stored in the first KD+1 rows of the array. The
102 *> j-th column of A is stored in the j-th column of the array AB
103 *> as follows:
104 *> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
105 *> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
106 *> On exit, the diagonal elements of AB are overwritten by the
107 *> diagonal elements of the tridiagonal matrix T; if KD > 0, the
108 *> elements on the first superdiagonal (if UPLO = 'U') or the
109 *> first subdiagonal (if UPLO = 'L') are overwritten by the
110 *> off-diagonal elements of T; the rest of AB is overwritten by
111 *> values generated during the reduction.
112 *> \endverbatim
113 *>
114 *> \param[in] LDAB
115 *> \verbatim
116 *> LDAB is INTEGER
117 *> The leading dimension of the array AB. LDAB >= KD+1.
118 *> \endverbatim
119 *>
120 *> \param[out] D
121 *> \verbatim
122 *> D is REAL array, dimension (N)
123 *> The diagonal elements of the tridiagonal matrix T.
124 *> \endverbatim
125 *>
126 *> \param[out] E
127 *> \verbatim
128 *> E is REAL array, dimension (N-1)
129 *> The off-diagonal elements of the tridiagonal matrix T:
130 *> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
131 *> \endverbatim
132 *>
133 *> \param[out] HOUS
134 *> \verbatim
135 *> HOUS is COMPLEX array, dimension LHOUS, that
136 *> store the Householder representation.
137 *> \endverbatim
138 *>
139 *> \param[in] LHOUS
140 *> \verbatim
141 *> LHOUS is INTEGER
142 *> The dimension of the array HOUS. LHOUS = MAX(1, dimension)
143 *> If LWORK = -1, or LHOUS=-1,
144 *> then a query is assumed; the routine
145 *> only calculates the optimal size of the HOUS array, returns
146 *> this value as the first entry of the HOUS array, and no error
147 *> message related to LHOUS is issued by XERBLA.
148 *> LHOUS = MAX(1, dimension) where
149 *> dimension = 4*N if VECT='N'
150 *> not available now if VECT='H'
151 *> \endverbatim
152 *>
153 *> \param[out] WORK
154 *> \verbatim
155 *> WORK is COMPLEX array, dimension LWORK.
156 *> \endverbatim
157 *>
158 *> \param[in] LWORK
159 *> \verbatim
160 *> LWORK is INTEGER
161 *> The dimension of the array WORK. LWORK = MAX(1, dimension)
162 *> If LWORK = -1, or LHOUS=-1,
163 *> then a workspace query is assumed; the routine
164 *> only calculates the optimal size of the WORK array, returns
165 *> this value as the first entry of the WORK array, and no error
166 *> message related to LWORK is issued by XERBLA.
167 *> LWORK = MAX(1, dimension) where
168 *> dimension = (2KD+1)*N + KD*NTHREADS
169 *> where KD is the blocking size of the reduction,
170 *> FACTOPTNB is the blocking used by the QR or LQ
171 *> algorithm, usually FACTOPTNB=128 is a good choice
172 *> NTHREADS is the number of threads used when
173 *> openMP compilation is enabled, otherwise =1.
174 *> \endverbatim
175 *>
176 *> \param[out] INFO
177 *> \verbatim
178 *> INFO is INTEGER
179 *> = 0: successful exit
180 *> < 0: if INFO = -i, the i-th argument had an illegal value
181 *> \endverbatim
182 *
183 * Authors:
184 * ========
185 *
186 *> \author Univ. of Tennessee
187 *> \author Univ. of California Berkeley
188 *> \author Univ. of Colorado Denver
189 *> \author NAG Ltd.
190 *
191 *> \date November 2017
192 *
193 *> \ingroup complexOTHERcomputational
194 *
195 *> \par Further Details:
196 * =====================
197 *>
198 *> \verbatim
199 *>
200 *> Implemented by Azzam Haidar.
201 *>
202 *> All details are available on technical report, SC11, SC13 papers.
203 *>
204 *> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
205 *> Parallel reduction to condensed forms for symmetric eigenvalue problems
206 *> using aggregated fine-grained and memory-aware kernels. In Proceedings
207 *> of 2011 International Conference for High Performance Computing,
208 *> Networking, Storage and Analysis (SC '11), New York, NY, USA,
209 *> Article 8 , 11 pages.
210 *> http://doi.acm.org/10.1145/2063384.2063394
211 *>
212 *> A. Haidar, J. Kurzak, P. Luszczek, 2013.
213 *> An improved parallel singular value algorithm and its implementation
214 *> for multicore hardware, In Proceedings of 2013 International Conference
215 *> for High Performance Computing, Networking, Storage and Analysis (SC '13).
216 *> Denver, Colorado, USA, 2013.
217 *> Article 90, 12 pages.
218 *> http://doi.acm.org/10.1145/2503210.2503292
219 *>
220 *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
221 *> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
222 *> calculations based on fine-grained memory aware tasks.
223 *> International Journal of High Performance Computing Applications.
224 *> Volume 28 Issue 2, Pages 196-209, May 2014.
225 *> http://hpc.sagepub.com/content/28/2/196
226 *>
227 *> \endverbatim
228 *>
229 * =====================================================================
230  SUBROUTINE chetrd_hb2st( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
231  $ D, E, HOUS, LHOUS, WORK, LWORK, INFO )
232 *
233 *
234 #if defined(_OPENMP)
235  use omp_lib
236 #endif
237 *
238  IMPLICIT NONE
239 *
240 * -- LAPACK computational routine (version 3.8.0) --
241 * -- LAPACK is a software package provided by Univ. of Tennessee, --
242 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
243 * November 2017
244 *
245 * .. Scalar Arguments ..
246  CHARACTER STAGE1, UPLO, VECT
247  INTEGER N, KD, LDAB, LHOUS, LWORK, INFO
248 * ..
249 * .. Array Arguments ..
250  REAL D( * ), E( * )
251  COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * )
252 * ..
253 *
254 * =====================================================================
255 *
256 * .. Parameters ..
257  REAL RZERO
258  COMPLEX ZERO, ONE
259  parameter( rzero = 0.0e+0,
260  $ zero = ( 0.0e+0, 0.0e+0 ),
261  $ one = ( 1.0e+0, 0.0e+0 ) )
262 * ..
263 * .. Local Scalars ..
264  LOGICAL LQUERY, WANTQ, UPPER, AFTERS1
265  INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
266  $ ed, stind, edind, blklastind, colpt, thed,
267  $ stepercol, grsiz, thgrsiz, thgrnb, thgrid,
268  $ nbtiles, ttype, tid, nthreads, debug,
269  $ abdpos, abofdpos, dpos, ofdpos, awpos,
270  $ inda, indw, apos, sizea, lda, indv, indtau,
271  $ sicev, sizetau, ldv, lhmin, lwmin
272  REAL ABSTMP
273  COMPLEX TMP
274 * ..
275 * .. External Subroutines ..
276  EXTERNAL chb2st_kernels, clacpy, claset, xerbla
277 * ..
278 * .. Intrinsic Functions ..
279  INTRINSIC min, max, ceiling, real
280 * ..
281 * .. External Functions ..
282  LOGICAL LSAME
283  INTEGER ILAENV2STAGE
284  EXTERNAL lsame, ilaenv2stage
285 * ..
286 * .. Executable Statements ..
287 *
288 * Determine the minimal workspace size required.
289 * Test the input parameters
290 *
291  debug = 0
292  info = 0
293  afters1 = lsame( stage1, 'Y' )
294  wantq = lsame( vect, 'V' )
295  upper = lsame( uplo, 'U' )
296  lquery = ( lwork.EQ.-1 ) .OR. ( lhous.EQ.-1 )
297 *
298 * Determine the block size, the workspace size and the hous size.
299 *
300  ib = ilaenv2stage( 2, 'CHETRD_HB2ST', vect, n, kd, -1, -1 )
301  lhmin = ilaenv2stage( 3, 'CHETRD_HB2ST', vect, n, kd, ib, -1 )
302  lwmin = ilaenv2stage( 4, 'CHETRD_HB2ST', vect, n, kd, ib, -1 )
303 *
304  IF( .NOT.afters1 .AND. .NOT.lsame( stage1, 'N' ) ) THEN
305  info = -1
306  ELSE IF( .NOT.lsame( vect, 'N' ) ) THEN
307  info = -2
308  ELSE IF( .NOT.upper .AND. .NOT.lsame( uplo, 'L' ) ) THEN
309  info = -3
310  ELSE IF( n.LT.0 ) THEN
311  info = -4
312  ELSE IF( kd.LT.0 ) THEN
313  info = -5
314  ELSE IF( ldab.LT.(kd+1) ) THEN
315  info = -7
316  ELSE IF( lhous.LT.lhmin .AND. .NOT.lquery ) THEN
317  info = -11
318  ELSE IF( lwork.LT.lwmin .AND. .NOT.lquery ) THEN
319  info = -13
320  END IF
321 *
322  IF( info.EQ.0 ) THEN
323  hous( 1 ) = lhmin
324  work( 1 ) = lwmin
325  END IF
326 *
327  IF( info.NE.0 ) THEN
328  CALL xerbla( 'CHETRD_HB2ST', -info )
329  RETURN
330  ELSE IF( lquery ) THEN
331  RETURN
332  END IF
333 *
334 * Quick return if possible
335 *
336  IF( n.EQ.0 ) THEN
337  hous( 1 ) = 1
338  work( 1 ) = 1
339  RETURN
340  END IF
341 *
342 * Determine pointer position
343 *
344  ldv = kd + ib
345  sizetau = 2 * n
346  sicev = 2 * n
347  indtau = 1
348  indv = indtau + sizetau
349  lda = 2 * kd + 1
350  sizea = lda * n
351  inda = 1
352  indw = inda + sizea
353  nthreads = 1
354  tid = 0
355 *
356  IF( upper ) THEN
357  apos = inda + kd
358  awpos = inda
359  dpos = apos + kd
360  ofdpos = dpos - 1
361  abdpos = kd + 1
362  abofdpos = kd
363  ELSE
364  apos = inda
365  awpos = inda + kd + 1
366  dpos = apos
367  ofdpos = dpos + 1
368  abdpos = 1
369  abofdpos = 2
370 
371  ENDIF
372 *
373 * Case KD=0:
374 * The matrix is diagonal. We just copy it (convert to "real" for
375 * complex because D is double and the imaginary part should be 0)
376 * and store it in D. A sequential code here is better or
377 * in a parallel environment it might need two cores for D and E
378 *
379  IF( kd.EQ.0 ) THEN
380  DO 30 i = 1, n
381  d( i ) = real( ab( abdpos, i ) )
382  30 CONTINUE
383  DO 40 i = 1, n-1
384  e( i ) = rzero
385  40 CONTINUE
386 *
387  hous( 1 ) = 1
388  work( 1 ) = 1
389  RETURN
390  END IF
391 *
392 * Case KD=1:
393 * The matrix is already Tridiagonal. We have to make diagonal
394 * and offdiagonal elements real, and store them in D and E.
395 * For that, for real precision just copy the diag and offdiag
396 * to D and E while for the COMPLEX case the bulge chasing is
397 * performed to convert the hermetian tridiagonal to symmetric
398 * tridiagonal. A simpler coversion formula might be used, but then
399 * updating the Q matrix will be required and based if Q is generated
400 * or not this might complicate the story.
401 *
402  IF( kd.EQ.1 ) THEN
403  DO 50 i = 1, n
404  d( i ) = real( ab( abdpos, i ) )
405  50 CONTINUE
406 *
407 * make off-diagonal elements real and copy them to E
408 *
409  IF( upper ) THEN
410  DO 60 i = 1, n - 1
411  tmp = ab( abofdpos, i+1 )
412  abstmp = abs( tmp )
413  ab( abofdpos, i+1 ) = abstmp
414  e( i ) = abstmp
415  IF( abstmp.NE.rzero ) THEN
416  tmp = tmp / abstmp
417  ELSE
418  tmp = one
419  END IF
420  IF( i.LT.n-1 )
421  $ ab( abofdpos, i+2 ) = ab( abofdpos, i+2 )*tmp
422 C IF( WANTZ ) THEN
423 C CALL CSCAL( N, CONJG( TMP ), Q( 1, I+1 ), 1 )
424 C END IF
425  60 CONTINUE
426  ELSE
427  DO 70 i = 1, n - 1
428  tmp = ab( abofdpos, i )
429  abstmp = abs( tmp )
430  ab( abofdpos, i ) = abstmp
431  e( i ) = abstmp
432  IF( abstmp.NE.rzero ) THEN
433  tmp = tmp / abstmp
434  ELSE
435  tmp = one
436  END IF
437  IF( i.LT.n-1 )
438  $ ab( abofdpos, i+1 ) = ab( abofdpos, i+1 )*tmp
439 C IF( WANTQ ) THEN
440 C CALL CSCAL( N, TMP, Q( 1, I+1 ), 1 )
441 C END IF
442  70 CONTINUE
443  ENDIF
444 *
445  hous( 1 ) = 1
446  work( 1 ) = 1
447  RETURN
448  END IF
449 *
450 * Main code start here.
451 * Reduce the hermitian band of A to a tridiagonal matrix.
452 *
453  thgrsiz = n
454  grsiz = 1
455  shift = 3
456  nbtiles = ceiling( real(n)/real(kd) )
457  stepercol = ceiling( real(shift)/real(grsiz) )
458  thgrnb = ceiling( real(n-1)/real(thgrsiz) )
459 *
460  CALL clacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
461  CALL claset( "A", kd, n, zero, zero, work( awpos ), lda )
462 *
463 *
464 * openMP parallelisation start here
465 *
466 #if defined(_OPENMP)
467 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
468 !$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
469 !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
470 !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
471 !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
472 !$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
473 !$OMP MASTER
474 #endif
475 *
476 * main bulge chasing loop
477 *
478  DO 100 thgrid = 1, thgrnb
479  stt = (thgrid-1)*thgrsiz+1
480  thed = min( (stt + thgrsiz -1), (n-1))
481  DO 110 i = stt, n-1
482  ed = min( i, thed )
483  IF( stt.GT.ed ) EXIT
484  DO 120 m = 1, stepercol
485  st = stt
486  DO 130 sweepid = st, ed
487  DO 140 k = 1, grsiz
488  myid = (i-sweepid)*(stepercol*grsiz)
489  $ + (m-1)*grsiz + k
490  IF ( myid.EQ.1 ) THEN
491  ttype = 1
492  ELSE
493  ttype = mod( myid, 2 ) + 2
494  ENDIF
495 
496  IF( ttype.EQ.2 ) THEN
497  colpt = (myid/2)*kd + sweepid
498  stind = colpt-kd+1
499  edind = min(colpt,n)
500  blklastind = colpt
501  ELSE
502  colpt = ((myid+1)/2)*kd + sweepid
503  stind = colpt-kd+1
504  edind = min(colpt,n)
505  IF( ( stind.GE.edind-1 ).AND.
506  $ ( edind.EQ.n ) ) THEN
507  blklastind = n
508  ELSE
509  blklastind = 0
510  ENDIF
511  ENDIF
512 *
513 * Call the kernel
514 *
515 #if defined(_OPENMP)
516  IF( ttype.NE.1 ) THEN
517 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
518 !$OMP$ DEPEND(in:WORK(MYID-1))
519 !$OMP$ DEPEND(out:WORK(MYID))
520  tid = omp_get_thread_num()
521  CALL chb2st_kernels( uplo, wantq, ttype,
522  $ stind, edind, sweepid, n, kd, ib,
523  $ work( inda ), lda,
524  $ hous( indv ), hous( indtau ), ldv,
525  $ work( indw + tid*kd ) )
526 !$OMP END TASK
527  ELSE
528 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
529 !$OMP$ DEPEND(out:WORK(MYID))
530  tid = omp_get_thread_num()
531  CALL chb2st_kernels( uplo, wantq, ttype,
532  $ stind, edind, sweepid, n, kd, ib,
533  $ work( inda ), lda,
534  $ hous( indv ), hous( indtau ), ldv,
535  $ work( indw + tid*kd ) )
536 !$OMP END TASK
537  ENDIF
538 #else
539  CALL chb2st_kernels( uplo, wantq, ttype,
540  $ stind, edind, sweepid, n, kd, ib,
541  $ work( inda ), lda,
542  $ hous( indv ), hous( indtau ), ldv,
543  $ work( indw + tid*kd ) )
544 #endif
545  IF ( blklastind.GE.(n-1) ) THEN
546  stt = stt + 1
547  EXIT
548  ENDIF
549  140 CONTINUE
550  130 CONTINUE
551  120 CONTINUE
552  110 CONTINUE
553  100 CONTINUE
554 *
555 #if defined(_OPENMP)
556 !$OMP END MASTER
557 !$OMP END PARALLEL
558 #endif
559 *
560 * Copy the diagonal from A to D. Note that D is REAL thus only
561 * the Real part is needed, the imaginary part should be zero.
562 *
563  DO 150 i = 1, n
564  d( i ) = real( work( dpos+(i-1)*lda ) )
565  150 CONTINUE
566 *
567 * Copy the off diagonal from A to E. Note that E is REAL thus only
568 * the Real part is needed, the imaginary part should be zero.
569 *
570  IF( upper ) THEN
571  DO 160 i = 1, n-1
572  e( i ) = real( work( ofdpos+i*lda ) )
573  160 CONTINUE
574  ELSE
575  DO 170 i = 1, n-1
576  e( i ) = real( work( ofdpos+(i-1)*lda ) )
577  170 CONTINUE
578  ENDIF
579 *
580  hous( 1 ) = lhmin
581  work( 1 ) = lwmin
582  RETURN
583 *
584 * End of CHETRD_HB2ST
585 *
586  END
587 
chb2st_kernels
subroutine chb2st_kernels(UPLO, WANTZ, TTYPE, ST, ED, SWEEP, N, NB, IB, A, LDA, V, TAU, LDVT, WORK)
CHB2ST_KERNELS
Definition: chb2st_kernels.f:170
clacpy
subroutine clacpy(UPLO, M, N, A, LDA, B, LDB)
CLACPY copies all or part of one two-dimensional array to another.
Definition: clacpy.f:105
xerbla
subroutine xerbla(SRNAME, INFO)
XERBLA
Definition: xerbla.f:62
claset
subroutine claset(UPLO, M, N, ALPHA, BETA, A, LDA)
CLASET initializes the off-diagonal elements and the diagonal elements of a matrix to given values.
Definition: claset.f:108
chetrd_hb2st
subroutine chetrd_hb2st(STAGE1, VECT, UPLO, N, KD, AB, LDAB, D, E, HOUS, LHOUS, WORK, LWORK, INFO)
CHBTRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric tridiagonal form T
Definition: chetrd_hb2st.F:232